diff --git a/pyproject.toml b/pyproject.toml
index aad4600..d71730c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "vl-apify-shared"
-version = "0.1.1"
+version = "0.1.2"
 description = "Shared utility package for use with Apify actors"
 authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
 requires-python = ">=3.12"
diff --git a/src/vl_apify_shared/zenrows.py b/src/vl_apify_shared/zenrows.py
index d31f5c4..c9af841 100644
--- a/src/vl_apify_shared/zenrows.py
+++ b/src/vl_apify_shared/zenrows.py
@@ -1,7 +1,8 @@
 import json
 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Optional
+
 from vl_apify_shared.logger import Logger
 from zenrows import ZenRowsClient
 
@@ -12,6 +13,28 @@ class ZenRowsConfig:
     api_key: str
 
 
+def _build_params(
+    *,
+    js_render: bool = False,
+    wait: Optional[int] = None,
+    wait_for: Optional[str] = None,
+    premium_proxy: bool = False,
+    js_instructions: Optional[list] = None,
+) -> dict:
+    params: dict = {}
+    if js_render:
+        params["js_render"] = "true"
+    if premium_proxy:
+        params["premium_proxy"] = "true"
+    if wait is not None:
+        params["wait"] = str(wait)
+    if wait_for is not None:
+        params["wait_for"] = wait_for
+    if js_instructions is not None:
+        params["js_instructions"] = json.dumps(js_instructions)
+    return params
+
+
 class ZenRowsScraper:
     def __init__(self, config: ZenRowsConfig):
         self.client = ZenRowsClient(config.api_key)
@@ -37,17 +60,13 @@ class ZenRowsScraper:
             js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
         """
         self.logger.debug(f"ZenRows scraping url: {url}...")
-        params: dict = {}
-        if js_render:
-            params["js_render"] = "true"
-        if premium_proxy:
-            params["premium_proxy"] = "true"
-        if wait is not None:
-            params["wait"] = str(wait)
-        if wait_for is not None:
-            params["wait_for"] = wait_for
-        if js_instructions is not None:
-            params["js_instructions"] = json.dumps(js_instructions)
+        params = _build_params(
+            js_render=js_render,
+            wait=wait,
+            wait_for=wait_for,
+            premium_proxy=premium_proxy,
+            js_instructions=js_instructions,
+        )
         start = time.monotonic()
         response = self.client.get(url, params=params)
         elapsed_time = round((time.monotonic() - start) * 1000)
@@ -56,17 +75,33 @@ class ZenRowsScraper:
         )
         return response
 
-    async def scrape_async(self, url: str, premium_proxy: bool = False):
-        """Fetch a URL asynchronously through ZenRows as a plain proxy (no JS rendering).
+    async def scrape_async(
+        self,
+        url: str,
+        js_render: bool = False,
+        wait: Optional[int] = None,
+        wait_for: Optional[str] = None,
+        premium_proxy: bool = False,
+        js_instructions: Optional[list] = None,
+    ):
+        """Fetch a URL asynchronously through ZenRows.
 
         Args
             url: URL to fetch
+            js_render: whether to wait for JavaScript to execute
+            wait: milliseconds to wait after page load
+            wait_for: css selector to wait for
             premium_proxy: use residential proxy (ZenRows premium_proxy=true)
+            js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
         """
         self.logger.debug(f"ZenRows fetching url: {url}...")
-        params: dict = {}
-        if premium_proxy:
-            params["premium_proxy"] = "true"
+        params = _build_params(
+            js_render=js_render,
+            wait=wait,
+            wait_for=wait_for,
+            premium_proxy=premium_proxy,
+            js_instructions=js_instructions,
+        )
         start = time.monotonic()
         response = await self.client.get_async(url, params=params)
         elapsed_time = round((time.monotonic() - start) * 1000)