This commit is contained in:
2026-06-03 21:03:22 -07:00
parent ddbf944e79
commit 05f2651f4f
13 changed files with 1693 additions and 1 deletions

1
.python-version Normal file
View File

@@ -0,0 +1 @@
3.12

2
.serena/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
/cache
/project.local.yml

132
.serena/project.yml Normal file
View File

@@ -0,0 +1,132 @@
# the name by which the project can be referenced within Serena
project_name: "apify-shared"
# list of languages for which language servers are started; choose from:
# al angular ansible bash clojure
# cpp cpp_ccls crystal csharp csharp_omnisharp
# dart elixir elm erlang fortran
# fsharp go groovy haskell haxe
# hlsl html java json julia
# kotlin lean4 lua luau markdown
# matlab msl nix ocaml pascal
# perl php php_phpactor powershell python
# python_jedi python_ty r rego ruby
# ruby_solargraph rust scala scss solidity
# svelte swift systemverilog terraform toml
# typescript typescript_vts vue yaml zig
# (This list may be outdated. For the current list, see values of Language enum here:
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
# Note:
# - For C, use cpp
# - For JavaScript, use typescript
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
# - For Free Pascal/Lazarus, use pascal
# Special requirements:
# Some languages require additional setup/installations.
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
# When using multiple languages, the first language server that supports a given file will be used for that file.
# The first language is the default language and the respective language server will be used as a fallback.
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
languages: []
# the encoding used by text files in the project
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
encoding: "utf-8"
# line ending convention to use when writing source files.
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
line_ending:
# The language backend to use for this project.
# If not set, the global setting from serena_config.yml is used.
# Valid values: LSP, JetBrains
# Note: the backend is fixed at startup. If a project with a different backend
# is activated post-init, an error will be returned.
language_backend:
# whether to use project's .gitignore files to ignore files
ignore_all_files_in_gitignore: true
# advanced configuration option allowing to configure language server-specific options.
# Maps the language key to the options.
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
# No documentation on options means no options are available.
ls_specific_settings: {}
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
# Paths can be absolute or relative to the project root.
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
# symbols and references across package boundaries.
# Currently supported for: TypeScript.
# Example:
# additional_workspace_folders:
# - ../sibling-package
# - ../shared-lib
additional_workspace_folders: []
# list of additional paths to ignore in this project.
# Same syntax as gitignore, so you can use * and **.
# Note: global ignored_paths from serena_config.yml are also applied additively.
ignored_paths: []
# whether the project is in read-only mode
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
# Added on 2025-04-18
read_only: false
# list of tool names to exclude.
# This extends the existing exclusions (e.g. from the global configuration)
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
excluded_tools: []
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
# This extends the existing inclusions (e.g. from the global configuration).
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
included_optional_tools: []
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
fixed_tools: []
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
# for this project.
# This setting can, in turn, be overridden by CLI parameters (--mode).
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
default_modes:
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
added_modes:
# initial prompt for the project. It will always be given to the LLM upon activating the project
# (contrary to the memories, which are loaded on demand).
initial_prompt: ""
# time budget (seconds) per tool call for the retrieval of additional symbol information
# such as docstrings or parameter information.
# This overrides the corresponding setting in the global configuration; see the documentation there.
# If null or missing, use the setting from the global configuration.
symbol_info_budget:
# list of regex patterns which, when matched, mark a memory entry as readonly.
# Extends the list from the global configuration, merging the two lists.
read_only_memory_patterns: []
# list of regex patterns for memories to completely ignore.
# Matching memories will not appear in list_memories or activate_project output
# and cannot be accessed via read_memory or write_memory.
# To access ignored memory files, use the read_file tool on the raw file path.
# Extends the list from the global configuration, merging the two lists.
# Example: ["_archive/.*", "_episodes/.*"]
ignored_memory_patterns: []

View File

@@ -1,3 +1,63 @@
# apify-shared
Shared utility package for use with Apify actors.
## Setup
```bash
uv sync
```
## Project structure
```
src/apify_shared/ # library source code
pyproject.toml # package metadata and dependencies
uv.lock # pinned dependency versions
.python-version # pinned Python version (3.12)
```
Add new modules under `src/apify_shared/` and re-export from `__init__.py` to expose them as part of the public API.
## Managing dependencies
```bash
uv add httpx # add a runtime dependency
uv add --dev pytest # add a dev-only dependency
uv remove httpx # remove a dependency
uv lock --upgrade # upgrade all dependencies to latest
uv lock --upgrade-package httpx # upgrade a single package
uv sync # install/update local venv from lock file
```
## Using as a dependency in another project
### From Gitea (recommended)
In the consuming project's `pyproject.toml`:
```toml
[project]
dependencies = ["apify-shared"]
[tool.uv.sources]
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared" }
```
Pin to a specific tag, branch, or commit (use only one of these lines; a TOML table cannot define the `apify-shared` key more than once):
```toml
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", tag = "v0.1.0" }
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", branch = "main" }
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", rev = "abc123" }
```
### From a local path (development)
```toml
[tool.uv.sources]
apify-shared = { path = "../apify-shared", editable = true }
```
Then run `uv sync` in the consuming project.

17
pyproject.toml Normal file
View File

@@ -0,0 +1,17 @@
[project]
name = "apify-shared"
version = "0.1.0"
description = "Shared utility package for use with Apify actors"
authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
requires-python = ">=3.12"
dependencies = [
"boto3>=1.35",
"httpx>=0.28.1",
"tenacity>=9.1.4",
"zenrows>=1.0",
"apify>=2.0",
]
[build-system]
requires = ["uv_build>=0.9.18,<0.10.0"]
build-backend = "uv_build"

View File

@@ -0,0 +1,2 @@
def hello() -> str:
    """Return the package's fixed greeting string."""
    greeting = "Hello from apify-shared!"
    return greeting

View File

@@ -0,0 +1,86 @@
import asyncio
import csv
import io
import json
import boto3
from dataclasses import dataclass
from typing import Any
import hashlib
@dataclass
class DataStoreConfig:
bucket: str
aws_access_key_id: str | None = None
aws_secret_access_key: str | None = None
aws_region: str = "us-east-1"
class DataStore:
def __init__(self, config: DataStoreConfig):
self.config = config
kwargs: dict = {"region_name": config.aws_region}
if config.aws_access_key_id and config.aws_secret_access_key:
kwargs["aws_access_key_id"] = config.aws_access_key_id
kwargs["aws_secret_access_key"] = config.aws_secret_access_key
self._client = boto3.client("s3", **kwargs)
async def upload_json(self, key: str, data: Any):
body = json.dumps(data, indent=2).encode("utf-8")
await self.__upload(key, body, "application/json")
async def upload_xml(self, key: str, data: Any):
if isinstance(data, bytes):
body = data
elif isinstance(data, str):
body = data.encode("utf-8")
else:
body = str(data).encode("utf-8")
await self.__upload(key, body, "application/xml")
async def upload_html(
self, key: str, data: str | bytes, metadata: dict[str, object] = {}
):
body = data.encode("utf-8") if isinstance(data, str) else data
await self.__upload(key, body, "text/html; charset=utf-8", metadata=metadata)
async def upload_txt(self, key: str, data: Any):
if isinstance(data, bytes):
body = data
else:
body = str(data).encode("utf-8")
await self.__upload(key, body, "text/plain")
async def upload_csv(self, key: str, data: Any):
if isinstance(data, bytes):
body = data
elif isinstance(data, str):
body = data.encode("utf-8")
elif isinstance(data, list):
buf = io.StringIO()
if data and isinstance(data[0], dict):
writer = csv.DictWriter(buf, fieldnames=data[0].keys())
writer.writeheader()
writer.writerows(data)
else:
writer = csv.writer(buf)
writer.writerows(data)
body = buf.getvalue().encode("utf-8")
else:
raise TypeError(f"Unsupported data type for CSV upload: {type(data)}")
await self.__upload(key, body, "text/csv")
async def __upload(
self, key: str, body: bytes, content_type: str, metadata: dict[str, object] = {}
):
await asyncio.to_thread(
self._client.put_object,
Bucket=self.config.bucket,
Key=key,
Body=body,
ContentType=content_type,
Metadata=metadata,
)
def hash(self, content: str):
return hashlib.md5(content.encode()).hexdigest()

66
src/apify_shared/fetch.py Normal file
View File

@@ -0,0 +1,66 @@
import logging
from dataclasses import dataclass
import httpx
from tenacity import (
AsyncRetrying,
before_sleep_log,
retry_if_exception,
stop_after_attempt,
wait_exponential,
)
# Module-level logger; handed to tenacity's before_sleep_log so each retry is
# reported at WARNING level before the client sleeps.
logger = logging.getLogger(__name__)
def _is_retryable(exc: BaseException) -> bool:
    """Tell the retry policy whether *exc* deserves another attempt.

    Timeouts are always retried. HTTP status errors are retried only for
    429, 500, 502, 503 and 504. Any other exception is not retried.
    """
    if isinstance(exc, httpx.TimeoutException):
        return True
    if not isinstance(exc, httpx.HTTPStatusError):
        return False
    status = exc.response.status_code
    return status in (429, 500, 502, 503, 504)
@dataclass
class FetchConfig:
    """Tunable settings for :class:`FetchClient`."""

    # Forwarded to ``httpx.AsyncClient(timeout=...)``.
    timeout: float = 30.0
    # Upper bound on attempts per request (``stop_after_attempt``).
    max_attempts: int = 3
    # Lower and upper bounds given to ``wait_exponential(min=..., max=...)``.
    backoff_min: float = 2.0
    backoff_max: float = 30.0
class FetchClient:
    """Async HTTP client that retries timeouts and selected HTTP status errors.

    Must be used as an async context manager; the underlying
    ``httpx.AsyncClient`` only exists between ``__aenter__`` and ``__aexit__``::

        async with FetchClient() as client:
            response = await client.get(url)
    """

    def __init__(self, config: FetchConfig | None = None):
        """Store *config* (or the defaults); no connection is opened yet."""
        self._config = config or FetchConfig()
        self._client: httpx.AsyncClient | None = None

    async def __aenter__(self) -> "FetchClient":
        """Open the underlying httpx client using the configured timeout."""
        self._client = await httpx.AsyncClient(
            timeout=self._config.timeout
        ).__aenter__()
        return self

    async def __aexit__(self, *args) -> None:
        """Close the underlying httpx client, if one was opened."""
        # Drop the reference first so a request issued after exit fails with a
        # clear error instead of reusing a closed client.
        client, self._client = self._client, None
        if client is not None:
            await client.__aexit__(*args)

    async def get(self, url: str, **kwargs) -> httpx.Response:
        """Send a GET request; *kwargs* are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("GET", url, **kwargs)

    async def post(self, url: str, **kwargs) -> httpx.Response:
        """Send a POST request; *kwargs* are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("POST", url, **kwargs)

    async def _request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """Issue one request, retrying while ``_is_retryable`` accepts the failure.

        Returns:
            The first response whose ``raise_for_status()`` succeeds.

        Raises:
            RuntimeError: if called outside ``async with``.
            httpx.HTTPError: the last failure, re-raised once attempts are
                exhausted or the error is not retryable.
        """
        # An explicit raise (not ``assert``) so the check survives ``python -O``.
        client = self._client
        if client is None:
            raise RuntimeError("FetchClient must be used as a context manager")
        cfg = self._config
        async for attempt in AsyncRetrying(
            retry=retry_if_exception(_is_retryable),
            stop=stop_after_attempt(cfg.max_attempts),
            wait=wait_exponential(min=cfg.backoff_min, max=cfg.backoff_max),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        ):
            with attempt:
                response = await client.request(method, url, **kwargs)
                response.raise_for_status()
                return response
        # Not expected to be reached: with reraise=True the loop either returns
        # or raises. Kept so the coroutine can never silently return None.
        raise RuntimeError("retry loop ended without a response")

View File

@@ -0,0 +1,18 @@
from apify import Actor
class Logger:
    """Thin wrapper that forwards level-named calls to ``Actor.log``.

    ``msg`` and ``args`` follow the standard ``logging`` convention: ``args``
    are merged into ``msg`` with ``%``-style formatting.
    """

    # Every method unpacks ``*args``. Passing the tuple itself as one argument
    # makes ``msg % args`` fail for calls without arguments and mis-format
    # calls with them.

    def info(self, msg: object, *args: object) -> None:
        """Log *msg* at INFO level."""
        Actor.log.info(msg, *args)

    def debug(self, msg: object, *args: object) -> None:
        """Log *msg* at DEBUG level."""
        Actor.log.debug(msg, *args)

    def error(self, msg: object, *args: object) -> None:
        """Log *msg* at ERROR level."""
        Actor.log.error(msg, *args)

    def fatal(self, msg: object, *args: object) -> None:
        """Log *msg* at CRITICAL level."""
        # ``critical`` is the documented name; ``Logger.fatal`` is only an alias.
        Actor.log.critical(msg, *args)

    def warn(self, msg: object, *args: object) -> None:
        """Log *msg* at WARNING level."""
        Actor.log.warning(msg, *args)

View File

10
src/apify_shared/utils.py Normal file
View File

@@ -0,0 +1,10 @@
from datetime import datetime, timezone
def timestamp_str():
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``."""
    utc_now = datetime.now(tz=timezone.utc)
    return f"{utc_now:%Y%m%d_%H%M%S}"
def date_str(iso_date: str) -> str:
    """Normalize an ISO 8601 date/datetime string to YYYYMMDD for use in S3 keys.

    Both the extended (``2024-01-05T10:30:00Z``) and the basic
    (``20240105T103000Z``) forms are accepted, and surrounding whitespace is
    ignored. The input is not validated: text that is not an ISO 8601 date is
    truncated the same way without raising.
    """
    # Strip hyphens before slicing so the result is 8 characters whether or not
    # the date part used separators. Slicing first would keep part of the time
    # for basic-format input.
    return iso_date.strip().replace("-", "")[:8]

View File

@@ -0,0 +1,19 @@
from virtek_apify.logger import Logger
import time
from zenrows import ZenRowsClient
# NOTE(review): Logger is imported from ``virtek_apify.logger`` while this
# package lives under ``src/apify_shared/`` - confirm the import path resolves.
class ZenRowsScraper:
    """Fetch pages through a ``ZenRowsClient`` and log how long each call took."""

    def __init__(self, logger: Logger, api_key: str):
        """Keep *logger* and build the ZenRows client from *api_key*."""
        self.logger = logger
        self.client = ZenRowsClient(api_key)

    def scrape(self, url: str):
        """Request *url* with ``mode=auto`` and return the client's response.

        A debug message is logged before the call, and another afterwards with
        the response status code and the elapsed time in whole milliseconds.
        """
        log = self.logger
        log.debug(f"ZenRows scraping url: {url}...")

        # Monotonic clock: the measured duration is unaffected by wall-clock changes.
        started_at = time.monotonic()
        response = self.client.get(url, params={"mode": "auto"})
        seconds_taken = time.monotonic() - started_at
        elapsed_time = round(seconds_taken * 1000)

        log.debug(
            f"ZenRows scraped url: {url}. Returned status: {response.status_code} in {elapsed_time}ms"
        )
        return response

1279
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff