v0.1.0

2026-06-03 21:03:22 -07:00
parent ddbf944e79
commit 05f2651f4f
13 changed files with 1693 additions and 1 deletions
--- a/.python-version
+++ b/.python-version
@@ -0,0 +1 @@
 3.12
--- a/.serena/.gitignore
+++ b/.serena/.gitignore
@@ -0,0 +1,2 @@
 /cache
 /project.local.yml
--- a/.serena/project.yml
+++ b/.serena/project.yml
@@ -0,0 +1,132 @@
 # the name by which the project can be referenced within Serena
 project_name: "apify-shared"
 # list of languages for which language servers are started; choose from:
 #   al                  angular             ansible             bash                clojure
 #   cpp                 cpp_ccls            crystal             csharp              csharp_omnisharp
 #   dart                elixir              elm                 erlang              fortran
 #   fsharp              go                  groovy              haskell             haxe
 #   hlsl                html                java                json                julia
 #   kotlin              lean4               lua                 luau                markdown
 #   matlab              msl                 nix                 ocaml               pascal
 #   perl                php                 php_phpactor        powershell          python
 #   python_jedi         python_ty           r                   rego                ruby
 #   ruby_solargraph     rust                scala               scss                solidity
 #   svelte              swift               systemverilog       terraform           toml
 #   typescript          typescript_vts      vue                 yaml                zig
 #   (This list may be outdated. For the current list, see values of Language enum here:
 #   https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
 #   For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
 # Note:
 #   - For C, use cpp
 #   - For JavaScript, use typescript
 #   - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
 #   - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
 #   - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
 #   - For Free Pascal/Lazarus, use pascal
 # Special requirements:
 #   Some languages require additional setup/installations.
 #   See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
 # When using multiple languages, the first language server that supports a given file will be used for that file.
 # The first language is the default language and the respective language server will be used as a fallback.
 # Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
 languages: []
 # the encoding used by text files in the project
 # For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
 encoding: "utf-8"
 # line ending convention to use when writing source files.
 # Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
 # This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
 line_ending:
 # The language backend to use for this project.
 # If not set, the global setting from serena_config.yml is used.
 # Valid values: LSP, JetBrains
 # Note: the backend is fixed at startup. If a project with a different backend
 # is activated post-init, an error will be returned.
 language_backend:
 # whether to use project's .gitignore files to ignore files
 ignore_all_files_in_gitignore: true
 # advanced configuration option allowing to configure language server-specific options.
 # Maps the language key to the options.
 # Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
 # No documentation on options means no options are available.
 ls_specific_settings: {}
 # list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
 # Paths can be absolute or relative to the project root.
 # Each folder is registered as an LSP workspace folder, enabling language servers to discover
 # symbols and references across package boundaries.
 # Currently supported for: TypeScript.
 # Example:
 #   additional_workspace_folders:
 #     - ../sibling-package
 #     - ../shared-lib
 additional_workspace_folders: []
 # list of additional paths to ignore in this project.
 # Same syntax as gitignore, so you can use * and **.
 # Note: global ignored_paths from serena_config.yml are also applied additively.
 ignored_paths: []
 # whether the project is in read-only mode
 # If set to true, all editing tools will be disabled and attempts to use them will result in an error
 # Added on 2025-04-18
 read_only: false
 # list of tool names to exclude.
 # This extends the existing exclusions (e.g. from the global configuration)
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 excluded_tools: []
 # list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
 # This extends the existing inclusions (e.g. from the global configuration).
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 included_optional_tools: []
 # fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
 # This cannot be combined with non-empty excluded_tools or included_optional_tools.
 # Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
 fixed_tools: []
 # list of mode names that are to be activated by default, overriding the setting in the global configuration.
 # The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
 # If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
 # Otherwise, this overrides the setting from the global configuration (serena_config.yml).
 # Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
 # for this project.
 # This setting can, in turn, be overridden by CLI parameters (--mode).
 # See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
 default_modes:
 # list of mode names to be activated additionally for this project, e.g. ["query-projects"]
 # The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
 # See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
 added_modes:
 # initial prompt for the project. It will always be given to the LLM upon activating the project
 # (contrary to the memories, which are loaded on demand).
 initial_prompt: ""
 # time budget (seconds) per tool call for the retrieval of additional symbol information
 # such as docstrings or parameter information.
 # This overrides the corresponding setting in the global configuration; see the documentation there.
 # If null or missing, use the setting from the global configuration.
 symbol_info_budget:
 # list of regex patterns which, when matched, mark a memory entry as read-only.
 # Extends the list from the global configuration, merging the two lists.
 read_only_memory_patterns: []
 # list of regex patterns for memories to completely ignore.
 # Matching memories will not appear in list_memories or activate_project output
 # and cannot be accessed via read_memory or write_memory.
 # To access ignored memory files, use the read_file tool on the raw file path.
 # Extends the list from the global configuration, merging the two lists.
 # Example: ["_archive/.*", "_episodes/.*"]
 ignored_memory_patterns: []
--- a/README.md
+++ b/README.md
@@ -1,3 +1,63 @@
 # apify-shared
-Shared utility package for use with Apify actors
+Shared utility package for use with Apify actors.
 ## Setup
 ```bash
 uv sync
 ```
 ## Project structure
 ```
 src/apify_shared/   # library source code
 pyproject.toml      # package metadata and dependencies
 uv.lock             # pinned dependency versions
 .python-version     # pinned Python version (3.12)
 ```
 Add new modules under `src/apify_shared/` and re-export from `__init__.py` to expose them as part of the public API.
 ## Managing dependencies
 ```bash
 uv add httpx                  # add a runtime dependency
 uv add --dev pytest           # add a dev-only dependency
 uv remove httpx               # remove a dependency
 uv lock --upgrade             # upgrade all dependencies to latest
 uv lock --upgrade-package httpx  # upgrade a single package
 uv sync                       # install/update local venv from lock file
 ```
 ## Using as a dependency in another project
 ### From Gitea (recommended)
 In the consuming project's `pyproject.toml`:
 ```toml
 [project]
 dependencies = ["apify-shared"]
 [tool.uv.sources]
 apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared" }
 ```
 Pin to a specific tag, branch, or commit (the lines below are alternatives; keep only one of them):
 ```toml
 apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", tag = "v0.1.0" }
 apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", branch = "main" }
 apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", rev = "abc123" }
 ```
 ### From a local path (development)
 ```toml
 [tool.uv.sources]
 apify-shared = { path = "../apify-shared", editable = true }
 ```
 Then run `uv sync` in the consuming project.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,17 @@
 [project]
 name = "apify-shared"
 version = "0.1.0"
 description = "Shared utility package for use with Apify actors"
 authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
 requires-python = ">=3.12"
 dependencies = [
    "boto3>=1.35",
    "httpx>=0.28.1",
    "tenacity>=9.1.4",
    "zenrows>=1.0",
    "apify>=2.0",
 ]
 [build-system]
 requires = ["uv_build>=0.9.18,<0.10.0"]
 build-backend = "uv_build"
--- a/src/apify_shared/__init__.py
+++ b/src/apify_shared/__init__.py
@@ -0,0 +1,2 @@
 def hello() -> str:
    """Return the package's fixed greeting string."""
    greeting = "Hello from apify-shared!"
    return greeting
--- a/src/apify_shared/data_store.py
+++ b/src/apify_shared/data_store.py
@@ -0,0 +1,86 @@
 import asyncio
 import csv
 import io
 import json
 import boto3
 from dataclasses import dataclass
 from typing import Any
 import hashlib
@dataclass
 class DataStoreConfig:
    """Settings that ``DataStore`` reads to build its boto3 S3 client and to pick the target bucket."""
    # Bucket every upload is written to (sent as ``Bucket`` to ``put_object``).
    bucket: str
    # Optional static credentials. ``DataStore`` forwards them to boto3 only when
    # BOTH are set; if either is missing, neither is passed.
    aws_access_key_id: str | None = None
    aws_secret_access_key: str | None = None
    # Passed to boto3 as ``region_name``.
    aws_region: str = "us-east-1"
 class DataStore:
    def __init__(self, config: DataStoreConfig):
        self.config = config
        kwargs: dict = {"region_name": config.aws_region}
        if config.aws_access_key_id and config.aws_secret_access_key:
            kwargs["aws_access_key_id"] = config.aws_access_key_id
            kwargs["aws_secret_access_key"] = config.aws_secret_access_key
        self._client = boto3.client("s3", **kwargs)
    async def upload_json(self, key: str, data: Any):
        body = json.dumps(data, indent=2).encode("utf-8")
        await self.__upload(key, body, "application/json")
    async def upload_xml(self, key: str, data: Any):
        if isinstance(data, bytes):
            body = data
        elif isinstance(data, str):
            body = data.encode("utf-8")
        else:
            body = str(data).encode("utf-8")
        await self.__upload(key, body, "application/xml")
    async def upload_html(
        self, key: str, data: str | bytes, metadata: dict[str, object] = {}
    ):
        body = data.encode("utf-8") if isinstance(data, str) else data
        await self.__upload(key, body, "text/html; charset=utf-8", metadata=metadata)
    async def upload_txt(self, key: str, data: Any):
        if isinstance(data, bytes):
            body = data
        else:
            body = str(data).encode("utf-8")
        await self.__upload(key, body, "text/plain")
    async def upload_csv(self, key: str, data: Any):
        if isinstance(data, bytes):
            body = data
        elif isinstance(data, str):
            body = data.encode("utf-8")
        elif isinstance(data, list):
            buf = io.StringIO()
            if data and isinstance(data[0], dict):
                writer = csv.DictWriter(buf, fieldnames=data[0].keys())
                writer.writeheader()
                writer.writerows(data)
            else:
                writer = csv.writer(buf)
                writer.writerows(data)
            body = buf.getvalue().encode("utf-8")
        else:
            raise TypeError(f"Unsupported data type for CSV upload: {type(data)}")
        await self.__upload(key, body, "text/csv")
    async def __upload(
        self, key: str, body: bytes, content_type: str, metadata: dict[str, object] = {}
    ):
        await asyncio.to_thread(
            self._client.put_object,
            Bucket=self.config.bucket,
            Key=key,
            Body=body,
            ContentType=content_type,
            Metadata=metadata,
        )
    def hash(self, content: str):
        return hashlib.md5(content.encode()).hexdigest()
--- a/src/apify_shared/fetch.py
+++ b/src/apify_shared/fetch.py
@@ -0,0 +1,66 @@
 import logging
 from dataclasses import dataclass
 import httpx
 from tenacity import (
    AsyncRetrying,
    before_sleep_log,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
 )
 logger = logging.getLogger(__name__)
 def _is_retryable(exc: BaseException) -> bool:
    if isinstance(exc, httpx.TimeoutException):
        return True
    if isinstance(exc, httpx.HTTPStatusError):
        return exc.response.status_code in (429, 500, 502, 503, 504)
    return False
@dataclass
 class FetchConfig:
    """Tunable settings read by ``FetchClient`` for timeouts and retry behaviour."""
    # Passed as ``timeout`` to ``httpx.AsyncClient`` (presumably seconds - confirm against httpx docs).
    timeout: float = 30.0
    # Passed to tenacity's ``stop_after_attempt``.
    max_attempts: int = 3
    # Passed as ``min`` / ``max`` to tenacity's ``wait_exponential``
    # (presumably seconds - confirm against tenacity docs).
    backoff_min: float = 2.0
    backoff_max: float = 30.0
 class FetchClient:
    """Async HTTP client that retries retryable failures with exponential backoff.

    Must be used as an async context manager::

        async with FetchClient() as client:
            response = await client.get(url)

    Entering opens an ``httpx.AsyncClient``; exiting closes it. Each request is
    retried according to ``_is_retryable`` and the ``FetchConfig`` in use.
    """

    def __init__(self, config: FetchConfig | None = None):
        """Store ``config`` (or a default ``FetchConfig``); no connection is opened yet."""
        self._config = config or FetchConfig()
        self._client: httpx.AsyncClient | None = None

    async def __aenter__(self) -> "FetchClient":
        """Open the underlying ``httpx.AsyncClient`` with the configured timeout."""
        client = httpx.AsyncClient(timeout=self._config.timeout)
        self._client = await client.__aenter__()
        return self

    async def __aexit__(self, *args) -> None:
        """Close the underlying client, if one is open."""
        # Clear the reference first so a later call fails with the clear
        # "must be used as a context manager" error instead of hitting a
        # closed httpx client.
        client, self._client = self._client, None
        if client is not None:
            await client.__aexit__(*args)

    async def get(self, url: str, **kwargs) -> httpx.Response:
        """Send a GET request; ``kwargs`` are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("GET", url, **kwargs)

    async def post(self, url: str, **kwargs) -> httpx.Response:
        """Send a POST request; ``kwargs`` are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("POST", url, **kwargs)

    async def _request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """Send one request, retrying while ``_is_retryable`` accepts the failure.

        Raises:
            RuntimeError: if called outside ``async with``.
            httpx.HTTPStatusError: for a 4xx/5xx response once retries are
                exhausted or the status is not retryable.
            httpx.TimeoutException: if the last attempt timed out.
        """
        # An explicit raise rather than ``assert``: asserts are stripped under
        # ``python -O``, which would turn this into an AttributeError on None.
        client = self._client
        if client is None:
            raise RuntimeError("FetchClient must be used as a context manager")
        cfg = self._config
        async for attempt in AsyncRetrying(
            retry=retry_if_exception(_is_retryable),
            stop=stop_after_attempt(cfg.max_attempts),
            wait=wait_exponential(min=cfg.backoff_min, max=cfg.backoff_max),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,  # surface the original httpx error, not tenacity's RetryError
        ):
            with attempt:
                response = await client.request(method, url, **kwargs)
                response.raise_for_status()
                return response
        # With ``reraise=True`` the loop either returns or raises; this keeps
        # the function from implicitly returning None if that ever changes.
        raise RuntimeError("retry loop finished without a response")
--- a/src/apify_shared/logger.py
+++ b/src/apify_shared/logger.py
@@ -0,0 +1,18 @@
 from apify import Actor
 class Logger:
    """Small facade that forwards log calls to ``Actor.log``.

    Each method takes a message plus optional %-style formatting arguments and
    passes them on unpacked, e.g. ``logger.info("fetched %s items", count)``.
    Passing the ``args`` tuple as one positional argument (the previous
    behaviour) makes the formatting step fail for any message without
    placeholders, assuming ``Actor.log`` is a standard ``logging.Logger``.
    """

    def info(self, msg: object, *args: object) -> None:
        """Log ``msg`` at INFO level."""
        Actor.log.info(msg, *args)

    def debug(self, msg: object, *args: object) -> None:
        """Log ``msg`` at DEBUG level."""
        Actor.log.debug(msg, *args)

    def error(self, msg: object, *args: object) -> None:
        """Log ``msg`` at ERROR level."""
        Actor.log.error(msg, *args)

    def fatal(self, msg: object, *args: object) -> None:
        """Log ``msg`` at CRITICAL level."""
        # ``critical`` is the supported name; ``fatal`` on stdlib loggers is
        # only a discouraged alias for it.
        Actor.log.critical(msg, *args)

    def warn(self, msg: object, *args: object) -> None:
        """Log ``msg`` at WARNING level."""
        Actor.log.warning(msg, *args)
--- a/src/apify_shared/py.typed
+++ b/src/apify_shared/py.typed
--- a/src/apify_shared/utils.py
+++ b/src/apify_shared/utils.py
@@ -0,0 +1,10 @@
 from datetime import datetime, timezone
 def timestamp_str():
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y%m%d_%H%M%S}"
 def date_str(iso_date: str) -> str:
    """Normalize an ISO 8601 date/datetime string to YYYYMMDD for use in S3 keys.

    Both the extended form (``2024-01-15``, ``2024-01-15T10:30:00Z``) and the
    basic form (``20240115``, ``20240115T103000Z``) are accepted, as is a space
    in place of the ``T`` separator. Surrounding whitespace is ignored.

    The input is not validated as a real calendar date; the function only
    extracts and compacts the date portion.
    """
    date_part = iso_date.strip()
    # Drop any time portion before removing dashes, so a basic-format
    # datetime does not leak its "T" and hour digits into the result.
    for separator in ("T", "t", " "):
        date_part = date_part.partition(separator)[0]
    return date_part.replace("-", "")[:8]
--- a/src/apify_shared/zenrows.py
+++ b/src/apify_shared/zenrows.py
@@ -0,0 +1,19 @@
 from virtek_apify.logger import Logger
 import time
 from zenrows import ZenRowsClient
 class ZenRowsScraper:
    """Fetch pages through the ZenRows API and log how long each request took."""

    def __init__(self, logger: Logger, api_key: str):
        """Keep the logger and build a ``ZenRowsClient`` from ``api_key``."""
        self.logger = logger
        self.client = ZenRowsClient(api_key)

    def scrape(self, url: str):
        """Request ``url`` via ZenRows with ``mode=auto`` and return the response.

        A debug line is logged before the request and another afterwards with
        the response status code and the elapsed time in milliseconds. The call
        is synchronous.
        """
        log_debug = self.logger.debug
        log_debug(f"ZenRows scraping url: {url}...")

        request_params = {"mode": "auto"}
        began = time.monotonic()
        response = self.client.get(url, params=request_params)
        elapsed_time = round(1000 * (time.monotonic() - began))

        log_debug(
            f"ZenRows scraped url: {url}. Returned status: {response.status_code} in {elapsed_time}ms"
        )
        return response
--- a/uv.lock
+++ b/uv.lock
		`@@ -0,0 +1,2 @@`
							`def hello() -> str:`
							`return "Hello from apify-shared!"`