scrape_async with proxy and js support
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
[project]
|
[project]
|
||||||
name = "vl-apify-shared"
|
name = "vl-apify-shared"
|
||||||
version = "0.1.1"
|
version = "0.1.2"
|
||||||
description = "Shared utility package for use with Apify actors"
|
description = "Shared utility package for use with Apify actors"
|
||||||
authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
|
authors = [{ name = "Ryan Byrne", email = "ryanjbyrne30@gmail.com" }]
|
||||||
requires-python = ">=3.12"
|
requires-python = ">=3.12"
|
||||||
|
|||||||
@@ -1,7 +1,8 @@
|
|||||||
import json
|
import json
|
||||||
import time
|
import time
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from vl_apify_shared.logger import Logger
|
from vl_apify_shared.logger import Logger
|
||||||
from zenrows import ZenRowsClient
|
from zenrows import ZenRowsClient
|
||||||
|
|
||||||
@@ -12,6 +13,28 @@ class ZenRowsConfig:
|
|||||||
api_key: str
|
api_key: str
|
||||||
|
|
||||||
|
|
||||||
|
def _build_params(
|
||||||
|
*,
|
||||||
|
js_render: bool = False,
|
||||||
|
wait: Optional[int] = None,
|
||||||
|
wait_for: Optional[str] = None,
|
||||||
|
premium_proxy: bool = False,
|
||||||
|
js_instructions: Optional[list] = None,
|
||||||
|
) -> dict:
|
||||||
|
params: dict = {}
|
||||||
|
if js_render:
|
||||||
|
params["js_render"] = "true"
|
||||||
|
if premium_proxy:
|
||||||
|
params["premium_proxy"] = "true"
|
||||||
|
if wait is not None:
|
||||||
|
params["wait"] = str(wait)
|
||||||
|
if wait_for is not None:
|
||||||
|
params["wait_for"] = wait_for
|
||||||
|
if js_instructions is not None:
|
||||||
|
params["js_instructions"] = json.dumps(js_instructions)
|
||||||
|
return params
|
||||||
|
|
||||||
|
|
||||||
class ZenRowsScraper:
|
class ZenRowsScraper:
|
||||||
def __init__(self, config: ZenRowsConfig):
|
def __init__(self, config: ZenRowsConfig):
|
||||||
self.client = ZenRowsClient(config.api_key)
|
self.client = ZenRowsClient(config.api_key)
|
||||||
@@ -37,17 +60,13 @@ class ZenRowsScraper:
|
|||||||
js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
|
js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
|
||||||
"""
|
"""
|
||||||
self.logger.debug(f"ZenRows scraping url: {url}...")
|
self.logger.debug(f"ZenRows scraping url: {url}...")
|
||||||
params: dict = {}
|
params = _build_params(
|
||||||
if js_render:
|
js_render=js_render,
|
||||||
params["js_render"] = "true"
|
wait=wait,
|
||||||
if premium_proxy:
|
wait_for=wait_for,
|
||||||
params["premium_proxy"] = "true"
|
premium_proxy=premium_proxy,
|
||||||
if wait is not None:
|
js_instructions=js_instructions,
|
||||||
params["wait"] = str(wait)
|
)
|
||||||
if wait_for is not None:
|
|
||||||
params["wait_for"] = wait_for
|
|
||||||
if js_instructions is not None:
|
|
||||||
params["js_instructions"] = json.dumps(js_instructions)
|
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
response = self.client.get(url, params=params)
|
response = self.client.get(url, params=params)
|
||||||
elapsed_time = round((time.monotonic() - start) * 1000)
|
elapsed_time = round((time.monotonic() - start) * 1000)
|
||||||
@@ -56,17 +75,33 @@ class ZenRowsScraper:
|
|||||||
)
|
)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
async def scrape_async(self, url: str, premium_proxy: bool = False):
|
async def scrape_async(
|
||||||
"""Fetch a URL asynchronously through ZenRows as a plain proxy (no JS rendering).
|
self,
|
||||||
|
url: str,
|
||||||
|
js_render: bool = False,
|
||||||
|
wait: Optional[int] = None,
|
||||||
|
wait_for: Optional[str] = None,
|
||||||
|
premium_proxy: bool = False,
|
||||||
|
js_instructions: Optional[list] = None,
|
||||||
|
):
|
||||||
|
"""Fetch a URL asynchronously through ZenRows.
|
||||||
|
|
||||||
Args
|
Args
|
||||||
url: URL to fetch
|
url: URL to fetch
|
||||||
|
js_render: whether to wait for JavaScript to execute
|
||||||
|
wait: milliseconds to wait after page load
|
||||||
|
wait_for: css selector to wait for
|
||||||
premium_proxy: use residential proxy (ZenRows premium_proxy=true)
|
premium_proxy: use residential proxy (ZenRows premium_proxy=true)
|
||||||
|
js_instructions: list of ZenRows instruction dicts (evaluate, wait_for, click, etc.)
|
||||||
"""
|
"""
|
||||||
self.logger.debug(f"ZenRows fetching url: {url}...")
|
self.logger.debug(f"ZenRows fetching url: {url}...")
|
||||||
params: dict = {}
|
params = _build_params(
|
||||||
if premium_proxy:
|
js_render=js_render,
|
||||||
params["premium_proxy"] = "true"
|
wait=wait,
|
||||||
|
wait_for=wait_for,
|
||||||
|
premium_proxy=premium_proxy,
|
||||||
|
js_instructions=js_instructions,
|
||||||
|
)
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
response = await self.client.get_async(url, params=params)
|
response = await self.client.get_async(url, params=params)
|
||||||
elapsed_time = round((time.monotonic() - start) * 1000)
|
elapsed_time = round((time.monotonic() - start) * 1000)
|
||||||
|
|||||||
Reference in New Issue
Block a user