v0.1.0
This commit is contained in:
1
.python-version
Normal file
1
.python-version
Normal file
@@ -0,0 +1 @@
|
|||||||
|
3.12
|
||||||
2
.serena/.gitignore
vendored
Normal file
2
.serena/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
/cache
|
||||||
|
/project.local.yml
|
||||||
132
.serena/project.yml
Normal file
132
.serena/project.yml
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
# the name by which the project can be referenced within Serena
|
||||||
|
project_name: "apify-shared"
|
||||||
|
|
||||||
|
|
||||||
|
# list of languages for which language servers are started; choose from:
|
||||||
|
# al angular ansible bash clojure
|
||||||
|
# cpp cpp_ccls crystal csharp csharp_omnisharp
|
||||||
|
# dart elixir elm erlang fortran
|
||||||
|
# fsharp go groovy haskell haxe
|
||||||
|
# hlsl html java json julia
|
||||||
|
# kotlin lean4 lua luau markdown
|
||||||
|
# matlab msl nix ocaml pascal
|
||||||
|
# perl php php_phpactor powershell python
|
||||||
|
# python_jedi python_ty r rego ruby
|
||||||
|
# ruby_solargraph rust scala scss solidity
|
||||||
|
# svelte swift systemverilog terraform toml
|
||||||
|
# typescript typescript_vts vue yaml zig
|
||||||
|
# (This list may be outdated. For the current list, see values of Language enum here:
|
||||||
|
# https://github.com/oraios/serena/blob/main/src/solidlsp/ls_config.py
|
||||||
|
# For some languages, there are alternative language servers, e.g. csharp_omnisharp, ruby_solargraph.)
|
||||||
|
# Note:
|
||||||
|
# - For C, use cpp
|
||||||
|
# - For JavaScript, use typescript
|
||||||
|
# - For Angular projects, use angular (subsumes typescript+html; requires `npm install` in the project root)
|
||||||
|
# - For Svelte projects, use svelte (subsumes typescript/javascript for .svelte projects; requires npm)
|
||||||
|
# - For SCSS / Sass / plain CSS, use scss (some-sass-language-server handles all three)
|
||||||
|
# - For Free Pascal/Lazarus, use pascal
|
||||||
|
# Special requirements:
|
||||||
|
# Some languages require additional setup/installations.
|
||||||
|
# See here for details: https://oraios.github.io/serena/01-about/020_programming-languages.html#language-servers
|
||||||
|
# When using multiple languages, the first language server that supports a given file will be used for that file.
|
||||||
|
# The first language is the default language and the respective language server will be used as a fallback.
|
||||||
|
# Note that when using the JetBrains backend, language servers are not used and this list is correspondingly ignored.
|
||||||
|
languages: []
|
||||||
|
|
||||||
|
# the encoding used by text files in the project
|
||||||
|
# For a list of possible encodings, see https://docs.python.org/3.11/library/codecs.html#standard-encodings
|
||||||
|
encoding: "utf-8"
|
||||||
|
|
||||||
|
# line ending convention to use when writing source files.
|
||||||
|
# Possible values: unset (use global setting), "lf", "crlf", or "native" (platform default)
|
||||||
|
# This does not affect Serena's own files (e.g. memories and configuration files), which always use native line endings.
|
||||||
|
line_ending:
|
||||||
|
|
||||||
|
# The language backend to use for this project.
|
||||||
|
# If not set, the global setting from serena_config.yml is used.
|
||||||
|
# Valid values: LSP, JetBrains
|
||||||
|
# Note: the backend is fixed at startup. If a project with a different backend
|
||||||
|
# is activated post-init, an error will be returned.
|
||||||
|
language_backend:
|
||||||
|
|
||||||
|
# whether to use project's .gitignore files to ignore files
|
||||||
|
ignore_all_files_in_gitignore: true
|
||||||
|
|
||||||
|
# advanced configuration option allowing to configure language server-specific options.
|
||||||
|
# Maps the language key to the options.
|
||||||
|
# Have a look at the docstring of the constructors of the LS implementations within solidlsp (e.g., for C# or PHP) to see which options are available.
|
||||||
|
# No documentation on options means no options are available.
|
||||||
|
ls_specific_settings: {}
|
||||||
|
|
||||||
|
# list of additional workspace folder paths for cross-package reference support (e.g. in monorepos).
|
||||||
|
# Paths can be absolute or relative to the project root.
|
||||||
|
# Each folder is registered as an LSP workspace folder, enabling language servers to discover
|
||||||
|
# symbols and references across package boundaries.
|
||||||
|
# Currently supported for: TypeScript.
|
||||||
|
# Example:
|
||||||
|
# additional_workspace_folders:
|
||||||
|
# - ../sibling-package
|
||||||
|
# - ../shared-lib
|
||||||
|
additional_workspace_folders: []
|
||||||
|
|
||||||
|
# list of additional paths to ignore in this project.
|
||||||
|
# Same syntax as gitignore, so you can use * and **.
|
||||||
|
# Note: global ignored_paths from serena_config.yml are also applied additively.
|
||||||
|
ignored_paths: []
|
||||||
|
|
||||||
|
# whether the project is in read-only mode
|
||||||
|
# If set to true, all editing tools will be disabled and attempts to use them will result in an error
|
||||||
|
# Added on 2025-04-18
|
||||||
|
read_only: false
|
||||||
|
|
||||||
|
# list of tool names to exclude.
|
||||||
|
# This extends the existing exclusions (e.g. from the global configuration)
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
excluded_tools: []
|
||||||
|
|
||||||
|
# list of tools to include that would otherwise be disabled (particularly optional tools that are disabled by default).
|
||||||
|
# This extends the existing inclusions (e.g. from the global configuration).
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
included_optional_tools: []
|
||||||
|
|
||||||
|
# fixed set of tools to use as the base tool set (if non-empty), replacing Serena's default set of tools.
|
||||||
|
# This cannot be combined with non-empty excluded_tools or included_optional_tools.
|
||||||
|
# Find the list of tools here: https://oraios.github.io/serena/01-about/035_tools.html
|
||||||
|
fixed_tools: []
|
||||||
|
|
||||||
|
# list of mode names that are to be activated by default, overriding the setting in the global configuration.
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# If the setting is undefined/empty, the default_modes from the global configuration (serena_config.yml) apply.
|
||||||
|
# Otherwise, this overrides the setting from the global configuration (serena_config.yml).
|
||||||
|
# Therefore, you can set this to [] if you do not want the default modes defined in the global config to apply
|
||||||
|
# for this project.
|
||||||
|
# This setting can, in turn, be overridden by CLI parameters (--mode).
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
default_modes:
|
||||||
|
|
||||||
|
# list of mode names to be activated additionally for this project, e.g. ["query-projects"]
|
||||||
|
# The full set of modes to be activated is base_modes (from global config) + default_modes + added_modes.
|
||||||
|
# See https://oraios.github.io/serena/02-usage/050_configuration.html#modes
|
||||||
|
added_modes:
|
||||||
|
|
||||||
|
# initial prompt for the project. It will always be given to the LLM upon activating the project
|
||||||
|
# (contrary to the memories, which are loaded on demand).
|
||||||
|
initial_prompt: ""
|
||||||
|
|
||||||
|
# time budget (seconds) per tool call for the retrieval of additional symbol information
|
||||||
|
# such as docstrings or parameter information.
|
||||||
|
# This overrides the corresponding setting in the global configuration; see the documentation there.
|
||||||
|
# If null or missing, use the setting from the global configuration.
|
||||||
|
symbol_info_budget:
|
||||||
|
|
||||||
|
# list of regex patterns which, when matched, mark a memory entry as read‑only.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
read_only_memory_patterns: []
|
||||||
|
|
||||||
|
# list of regex patterns for memories to completely ignore.
|
||||||
|
# Matching memories will not appear in list_memories or activate_project output
|
||||||
|
# and cannot be accessed via read_memory or write_memory.
|
||||||
|
# To access ignored memory files, use the read_file tool on the raw file path.
|
||||||
|
# Extends the list from the global configuration, merging the two lists.
|
||||||
|
# Example: ["_archive/.*", "_episodes/.*"]
|
||||||
|
ignored_memory_patterns: []
|
||||||
62
README.md
62
README.md
@@ -1,3 +1,63 @@
|
|||||||
# apify-shared
|
# apify-shared
|
||||||
|
|
||||||
Shared utility package for use with Apify actors
|
Shared utility package for use with Apify actors.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv sync
|
||||||
|
```
|
||||||
|
|
||||||
|
## Project structure
|
||||||
|
|
||||||
|
```
|
||||||
|
src/apify_shared/ # library source code
|
||||||
|
pyproject.toml # package metadata and dependencies
|
||||||
|
uv.lock # pinned dependency versions
|
||||||
|
.python-version # pinned Python version (3.12)
|
||||||
|
```
|
||||||
|
|
||||||
|
Add new modules under `src/apify_shared/` and re-export from `__init__.py` to expose them as part of the public API.
|
||||||
|
|
||||||
|
## Managing dependencies
|
||||||
|
|
||||||
|
```bash
|
||||||
|
uv add httpx # add a runtime dependency
|
||||||
|
uv add --dev pytest # add a dev-only dependency
|
||||||
|
uv remove httpx # remove a dependency
|
||||||
|
|
||||||
|
uv lock --upgrade # upgrade all dependencies to latest
|
||||||
|
uv lock --upgrade-package httpx # upgrade a single package
|
||||||
|
uv sync # install/update local venv from lock file
|
||||||
|
```
|
||||||
|
|
||||||
|
## Using as a dependency in another project
|
||||||
|
|
||||||
|
### From Gitea (recommended)
|
||||||
|
|
||||||
|
In the consuming project's `pyproject.toml`:
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[project]
|
||||||
|
dependencies = ["apify-shared"]
|
||||||
|
|
||||||
|
[tool.uv.sources]
|
||||||
|
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared" }
|
||||||
|
```
|
||||||
|
|
||||||
|
Pin to a specific tag, branch, or commit (these are alternatives — use only one of the following lines):
|
||||||
|
|
||||||
|
```toml
|
||||||
|
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", tag = "v0.1.0" }
|
||||||
|
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", branch = "main" }
|
||||||
|
apify-shared = { git = "https://your-gitea-instance.com/virtek-labs/apify-shared", rev = "abc123" }
|
||||||
|
```
|
||||||
|
|
||||||
|
### From a local path (development)
|
||||||
|
|
||||||
|
```toml
|
||||||
|
[tool.uv.sources]
|
||||||
|
apify-shared = { path = "../apify-shared", editable = true }
|
||||||
|
```
|
||||||
|
|
||||||
|
Then run `uv sync` in the consuming project.
|
||||||
|
|||||||
17
pyproject.toml
Normal file
17
pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
[project]
|
||||||
|
name = "apify-shared"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "Shared utility package for use with Apify actors"
|
||||||
|
authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
dependencies = [
|
||||||
|
"boto3>=1.35",
|
||||||
|
"httpx>=0.28.1",
|
||||||
|
"tenacity>=9.1.4",
|
||||||
|
"zenrows>=1.0",
|
||||||
|
"apify>=2.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[build-system]
|
||||||
|
requires = ["uv_build>=0.9.18,<0.10.0"]
|
||||||
|
build-backend = "uv_build"
|
||||||
2
src/apify_shared/__init__.py
Normal file
2
src/apify_shared/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
def hello() -> str:
    """Return the package's fixed greeting string."""
    greeting = "Hello from apify-shared!"
    return greeting
|
||||||
86
src/apify_shared/data_store.py
Normal file
86
src/apify_shared/data_store.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
import asyncio
|
||||||
|
import csv
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import boto3
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DataStoreConfig:
|
||||||
|
bucket: str
|
||||||
|
aws_access_key_id: str | None = None
|
||||||
|
aws_secret_access_key: str | None = None
|
||||||
|
aws_region: str = "us-east-1"
|
||||||
|
|
||||||
|
|
||||||
|
class DataStore:
|
||||||
|
def __init__(self, config: DataStoreConfig):
|
||||||
|
self.config = config
|
||||||
|
kwargs: dict = {"region_name": config.aws_region}
|
||||||
|
if config.aws_access_key_id and config.aws_secret_access_key:
|
||||||
|
kwargs["aws_access_key_id"] = config.aws_access_key_id
|
||||||
|
kwargs["aws_secret_access_key"] = config.aws_secret_access_key
|
||||||
|
self._client = boto3.client("s3", **kwargs)
|
||||||
|
|
||||||
|
async def upload_json(self, key: str, data: Any):
|
||||||
|
body = json.dumps(data, indent=2).encode("utf-8")
|
||||||
|
await self.__upload(key, body, "application/json")
|
||||||
|
|
||||||
|
async def upload_xml(self, key: str, data: Any):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
body = data
|
||||||
|
elif isinstance(data, str):
|
||||||
|
body = data.encode("utf-8")
|
||||||
|
else:
|
||||||
|
body = str(data).encode("utf-8")
|
||||||
|
await self.__upload(key, body, "application/xml")
|
||||||
|
|
||||||
|
async def upload_html(
|
||||||
|
self, key: str, data: str | bytes, metadata: dict[str, object] = {}
|
||||||
|
):
|
||||||
|
body = data.encode("utf-8") if isinstance(data, str) else data
|
||||||
|
await self.__upload(key, body, "text/html; charset=utf-8", metadata=metadata)
|
||||||
|
|
||||||
|
async def upload_txt(self, key: str, data: Any):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
body = data
|
||||||
|
else:
|
||||||
|
body = str(data).encode("utf-8")
|
||||||
|
await self.__upload(key, body, "text/plain")
|
||||||
|
|
||||||
|
async def upload_csv(self, key: str, data: Any):
|
||||||
|
if isinstance(data, bytes):
|
||||||
|
body = data
|
||||||
|
elif isinstance(data, str):
|
||||||
|
body = data.encode("utf-8")
|
||||||
|
elif isinstance(data, list):
|
||||||
|
buf = io.StringIO()
|
||||||
|
if data and isinstance(data[0], dict):
|
||||||
|
writer = csv.DictWriter(buf, fieldnames=data[0].keys())
|
||||||
|
writer.writeheader()
|
||||||
|
writer.writerows(data)
|
||||||
|
else:
|
||||||
|
writer = csv.writer(buf)
|
||||||
|
writer.writerows(data)
|
||||||
|
body = buf.getvalue().encode("utf-8")
|
||||||
|
else:
|
||||||
|
raise TypeError(f"Unsupported data type for CSV upload: {type(data)}")
|
||||||
|
await self.__upload(key, body, "text/csv")
|
||||||
|
|
||||||
|
async def __upload(
|
||||||
|
self, key: str, body: bytes, content_type: str, metadata: dict[str, object] = {}
|
||||||
|
):
|
||||||
|
await asyncio.to_thread(
|
||||||
|
self._client.put_object,
|
||||||
|
Bucket=self.config.bucket,
|
||||||
|
Key=key,
|
||||||
|
Body=body,
|
||||||
|
ContentType=content_type,
|
||||||
|
Metadata=metadata,
|
||||||
|
)
|
||||||
|
|
||||||
|
def hash(self, content: str):
|
||||||
|
return hashlib.md5(content.encode()).hexdigest()
|
||||||
66
src/apify_shared/fetch.py
Normal file
66
src/apify_shared/fetch.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
from tenacity import (
|
||||||
|
AsyncRetrying,
|
||||||
|
before_sleep_log,
|
||||||
|
retry_if_exception,
|
||||||
|
stop_after_attempt,
|
||||||
|
wait_exponential,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_retryable(exc: BaseException) -> bool:
    """Tell the retry policy whether *exc* is worth another attempt.

    Timeouts are always retried. HTTP status errors are retried only for
    429 and the 500/502/503/504 server errors. Everything else is final.
    """
    if isinstance(exc, httpx.HTTPStatusError):
        status = exc.response.status_code
        return status in (429, 500, 502, 503, 504)
    return isinstance(exc, httpx.TimeoutException)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FetchConfig:
    """Tunable timeout and retry settings read by ``FetchClient``."""

    # Passed as the ``timeout`` argument of the underlying httpx client.
    timeout: float = 30.0
    # Upper bound on attempts per request (first try included).
    max_attempts: int = 3
    # Lower and upper bounds handed to the exponential backoff wait.
    backoff_min: float = 2.0
    backoff_max: float = 30.0
|
||||||
|
|
||||||
|
|
||||||
|
class FetchClient:
    """Async HTTP client that wraps ``httpx.AsyncClient`` with retries.

    Must be used as an async context manager::

        async with FetchClient() as client:
            response = await client.get(url)

    Each request is retried according to ``FetchConfig`` for the exceptions
    that ``_is_retryable`` accepts.
    """

    def __init__(self, config: FetchConfig | None = None):
        """Store *config* (or a default ``FetchConfig``); no client is opened yet."""
        self._config = config or FetchConfig()
        self._client: httpx.AsyncClient | None = None

    async def __aenter__(self) -> "FetchClient":
        """Open the underlying httpx client with the configured timeout."""
        client = httpx.AsyncClient(timeout=self._config.timeout)
        self._client = await client.__aenter__()
        return self

    async def __aexit__(self, *args) -> None:
        """Close the underlying httpx client, if one is open."""
        # Drop the reference first so a later request fails with this class's
        # own "must be used as a context manager" error instead of operating
        # on a closed httpx client.
        client, self._client = self._client, None
        if client is not None:
            await client.__aexit__(*args)

    async def get(self, url: str, **kwargs) -> httpx.Response:
        """Send a GET request; ``kwargs`` are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("GET", url, **kwargs)

    async def post(self, url: str, **kwargs) -> httpx.Response:
        """Send a POST request; ``kwargs`` are forwarded to ``httpx.AsyncClient.request``."""
        return await self._request("POST", url, **kwargs)

    async def _request(self, method: str, url: str, **kwargs) -> httpx.Response:
        """Send one request, retrying retryable failures with exponential backoff.

        Returns:
            The first response whose ``raise_for_status()`` does not raise.

        Raises:
            RuntimeError: if called outside ``async with``.
            httpx.HTTPStatusError: for error statuses, once retries are
                exhausted or immediately when the status is not retryable.
            Exception: any other error raised by the request is propagated;
                ``reraise=True`` makes tenacity re-raise the original
                exception instead of wrapping it in ``RetryError``.
        """
        # An explicit raise (not ``assert``) so the guard survives ``python -O``.
        if self._client is None:
            raise RuntimeError("FetchClient must be used as a context manager")

        cfg = self._config
        retrying = AsyncRetrying(
            retry=retry_if_exception(_is_retryable),
            stop=stop_after_attempt(cfg.max_attempts),
            wait=wait_exponential(min=cfg.backoff_min, max=cfg.backoff_max),
            before_sleep=before_sleep_log(logger, logging.WARNING),
            reraise=True,
        )
        async for attempt in retrying:
            with attempt:
                response = await self._client.request(method, url, **kwargs)
                response.raise_for_status()
                return response

        # Not expected to be reached: the loop above either returns or raises.
        raise RuntimeError("retry loop finished without a response")
|
||||||
18
src/apify_shared/logger.py
Normal file
18
src/apify_shared/logger.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from apify import Actor
|
||||||
|
|
||||||
|
|
||||||
|
class Logger:
    """Thin facade that forwards log calls to the Apify ``Actor.log`` logger.

    Every method takes a message plus optional %-style formatting arguments
    and forwards them unpacked, following the standard ``logging`` call
    convention (``log.info("x=%s", x)``).
    """

    def info(self, msg: object, *args: object) -> None:
        """Log *msg* at INFO level; *args* are the %-format arguments for *msg*."""
        # Unpack: passing the tuple itself would make it a single format argument.
        Actor.log.info(msg, *args)

    def debug(self, msg: object, *args: object) -> None:
        """Log *msg* at DEBUG level; *args* are the %-format arguments for *msg*."""
        Actor.log.debug(msg, *args)

    def error(self, msg: object, *args: object) -> None:
        """Log *msg* at ERROR level; *args* are the %-format arguments for *msg*."""
        Actor.log.error(msg, *args)

    def fatal(self, msg: object, *args: object) -> None:
        """Log *msg* at CRITICAL level; *args* are the %-format arguments for *msg*."""
        # ``critical`` is the documented name; ``fatal`` is only an alias of it
        # on ``logging.Logger``.
        Actor.log.critical(msg, *args)

    def warn(self, msg: object, *args: object) -> None:
        """Log *msg* at WARNING level; *args* are the %-format arguments for *msg*."""
        Actor.log.warning(msg, *args)
|
||||||
0
src/apify_shared/py.typed
Normal file
0
src/apify_shared/py.typed
Normal file
10
src/apify_shared/utils.py
Normal file
10
src/apify_shared/utils.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from datetime import datetime, timezone
|
||||||
|
|
||||||
|
|
||||||
|
def timestamp_str() -> str:
    """Return the current UTC time formatted as ``YYYYMMDD_HHMMSS``."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y%m%d_%H%M%S}"
|
||||||
|
|
||||||
|
|
||||||
|
def date_str(iso_date: str) -> str:
    """Turn the leading ``YYYY-MM-DD`` of an ISO 8601 string into ``YYYYMMDD`` (for S3 keys).

    Only the first ten characters are looked at and their hyphens are dropped;
    the input is not validated as a date.
    """
    date_part = iso_date[:10]
    return "".join(date_part.split("-"))
|
||||||
19
src/apify_shared/zenrows.py
Normal file
19
src/apify_shared/zenrows.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import time

from zenrows import ZenRowsClient

# Logger is defined in this package's own ``logger`` module; the previous
# ``virtek_apify`` import path is not a package or dependency of this project.
from apify_shared.logger import Logger


class ZenRowsScraper:
    """Fetch pages through the ZenRows API and log how long each call took."""

    def __init__(self, logger: Logger, api_key: str):
        """Create a ZenRows client for *api_key* and keep *logger* for debug output."""
        self.client = ZenRowsClient(api_key)
        self.logger = logger

    def scrape(self, url: str):
        """Request *url* through ZenRows with ``mode=auto`` and return the client's response.

        This call is synchronous. The elapsed time, measured with
        ``time.monotonic()`` and rounded to whole milliseconds, is logged at
        debug level together with the response status code. The status is not
        checked here; the response is returned as received.
        """
        self.logger.debug(f"ZenRows scraping url: {url}...")
        started_at = time.monotonic()
        response = self.client.get(url, params={"mode": "auto"})
        elapsed_time = round((time.monotonic() - started_at) * 1000)
        self.logger.debug(
            f"ZenRows scraped url: {url}. Returned status: {response.status_code} in {elapsed_time}ms"
        )
        return response
|
||||||
Reference in New Issue
Block a user