scrape_async with proxy and js support

This commit is contained in:
2026-06-08 08:56:42 -07:00
parent 0cb2de58c5
commit 5b5d4ab8f6
2 changed files with 53 additions and 18 deletions

View File

@@ -1,6 +1,6 @@
[project] [project]
name = "vl-apify-shared" name = "vl-apify-shared"
version = "0.1.1" version = "0.1.2"
description = "Shared utility package for use with Apify actors" description = "Shared utility package for use with Apify actors"
authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }] authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
requires-python = ">=3.12" requires-python = ">=3.12"

View File

@@ -1,7 +1,8 @@
import json import json
import time import time
from dataclasses import dataclass, field from dataclasses import dataclass
from typing import Optional from typing import Optional
from vl_apify_shared.logger import Logger from vl_apify_shared.logger import Logger
from zenrows import ZenRowsClient from zenrows import ZenRowsClient
@@ -12,6 +13,28 @@ class ZenRowsConfig:
api_key: str api_key: str
def _build_params(
*,
js_render: bool = False,
wait: Optional[int] = None,
wait_for: Optional[str] = None,
premium_proxy: bool = False,
js_instructions: Optional[list] = None,
) -> dict:
params: dict = {}
if js_render:
params["js_render"] = "true"
if premium_proxy:
params["premium_proxy"] = "true"
if wait is not None:
params["wait"] = str(wait)
if wait_for is not None:
params["wait_for"] = wait_for
if js_instructions is not None:
params["js_instructions"] = json.dumps(js_instructions)
return params
class ZenRowsScraper: class ZenRowsScraper:
def __init__(self, config: ZenRowsConfig): def __init__(self, config: ZenRowsConfig):
self.client = ZenRowsClient(config.api_key) self.client = ZenRowsClient(config.api_key)
@@ -37,17 +60,13 @@ class ZenRowsScraper:
js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.) js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
""" """
self.logger.debug(f"ZenRows scraping url: {url}...") self.logger.debug(f"ZenRows scraping url: {url}...")
params: dict = {} params = _build_params(
if js_render: js_render=js_render,
params["js_render"] = "true" wait=wait,
if premium_proxy: wait_for=wait_for,
params["premium_proxy"] = "true" premium_proxy=premium_proxy,
if wait is not None: js_instructions=js_instructions,
params["wait"] = str(wait) )
if wait_for is not None:
params["wait_for"] = wait_for
if js_instructions is not None:
params["js_instructions"] = json.dumps(js_instructions)
start = time.monotonic() start = time.monotonic()
response = self.client.get(url, params=params) response = self.client.get(url, params=params)
elapsed_time = round((time.monotonic() - start) * 1000) elapsed_time = round((time.monotonic() - start) * 1000)
@@ -56,17 +75,33 @@ class ZenRowsScraper:
) )
return response return response
async def scrape_async(self, url: str, premium_proxy: bool = False): async def scrape_async(
"""Fetch a URL asynchronously through ZenRows as a plain proxy (no JS rendering). self,
url: str,
js_render: bool = False,
wait: Optional[int] = None,
wait_for: Optional[str] = None,
premium_proxy: bool = False,
js_instructions: Optional[list] = None,
):
"""Fetch a URL asynchronously through ZenRows.
Args Args
url: URL to fetch url: URL to fetch
js_render: whether to wait for JavaScript to execute
wait: milliseconds to wait after page load
wait_for: css selector to wait for
premium_proxy: use residential proxy (ZenRows premium_proxy=true) premium_proxy: use residential proxy (ZenRows premium_proxy=true)
js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
""" """
self.logger.debug(f"ZenRows fetching url: {url}...") self.logger.debug(f"ZenRows fetching url: {url}...")
params: dict = {} params = _build_params(
if premium_proxy: js_render=js_render,
params["premium_proxy"] = "true" wait=wait,
wait_for=wait_for,
premium_proxy=premium_proxy,
js_instructions=js_instructions,
)
start = time.monotonic() start = time.monotonic()
response = await self.client.get_async(url, params=params) response = await self.client.get_async(url, params=params)
elapsed_time = round((time.monotonic() - start) * 1000) elapsed_time = round((time.monotonic() - start) * 1000)