### Full Example of Request Queue Usage Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/03_storages.mdx Demonstrates the complete workflow of using a request queue, including adding requests, fetching them for processing, and marking them as handled. Ensure you have the necessary Apify client setup. ```python from apify_client import ApifyClient # Initialize the ApifyClient # If you don't have APIFY_API_TOKEN set in your environment, # you can pass it explicitly: ApifyClient(token='YOUR_API_TOKEN') client = ApifyClient() # Get a request queue client request_queue = client.request_queue('YOUR_REQUEST_QUEUE_ID') # Add requests to the queue await request_queue.add_request({ 'url': 'http://example.com/page1', 'method': 'GET', 'payload': 'some data', 'headers': {'X-My-Header': 'my-value'}, 'unique_key': 'http://example.com/page1', 'forefront': True, }) await request_queue.add_request({ 'url': 'http://example.com/page2', 'method': 'POST', 'payload': 'another data', }) # Fetch requests from the queue while True: request = await request_queue.fetch_next_request() if request is None: # All requests are processed break print(f"Processing request: {request['url']}") # Process the request (e.g., download content, extract data) # ... # Mark the request as handled await request_queue.mark_request_as_handled(request) # If you want to retry the request later, use reclaim_request: # await request_queue.reclaim_request(request) # Check if the queue is finished finished = await request_queue.is_finished() print(f"Request queue finished: {finished}") ``` -------------------------------- ### Install Development Dependencies Source: https://github.com/apify/apify-sdk-python/blob/master/CONTRIBUTING.md Installs all necessary dependencies for local development. Ensure uv is installed and configured. 
```sh uv run poe install-dev ``` -------------------------------- ### Full Example of Request Queue Operations in Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Demonstrates adding requests to a queue, fetching them for processing, and marking them as handled. Ensure you have the Apify SDK installed. ```python from apify_client import ApifyClient from apify import ApifyClient async def main(): # You can use the ApifyClient with an API token or without one. # If you don't provide an API token, the client will use the default environment variable APIFY_API_TOKEN. # If you don't have an API token set, the client will default to the anonymous access role. # client = ApifyClient("YOUR_API_TOKEN") client = ApifyClient() # Get a request queue client request_queue = await client.request_queue("my-request-queue") # Add requests to the queue await request_queue.add_request({ "url": "https://example.com/page1", "method": "GET", "unique_key": "https://example.com/page1", "payload": "some data", }) await request_queue.add_request({ "url": "https://example.com/page2", "method": "POST", "unique_key": "https://example.com/page2", "payload": { "key": "value" }, "forefront": True, # Add to the beginning of the queue }) # Fetch the next request from the queue request = await request_queue.fetch_next_request() if request: print(f"Processing request: {request['url']}") # Process the request here... # Mark the request as handled await request_queue.mark_request_as_handled(request) print("Request marked as handled.") else: print("No requests in the queue.") # Check if the queue is finished (all requests handled) is_finished = await request_queue.is_finished() print(f"Is queue finished? 
{is_finished}") # Reclaim a request (mark as not handled, so it gets retried) # await request_queue.reclaim_request(request['id']) # print("Request reclaimed.") # Get info about a specific request # request_info = await request_queue.get_request("some-request-id") # print(f"Request info: {request_info}") if __name__ == "__main__": import asyncio asyncio.run(main()) ``` -------------------------------- ### Start a simple web server in Python Actor Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/07_running_webserver.mdx This Python code starts a basic HTTP server that responds to GET requests with the current processed item count. Ensure the server listens on the port provided by `Actor.configuration.container_port`. ```python import os import logging from http.server import BaseHTTPRequestHandler, HTTPServer from apify import Actor logging.basicConfig(level=logging.INFO) class RequestHandler(BaseHTTPRequestHandler): def do_GET(self): if self.path == '/': self.send_response(200) self.send_header('Content-type', 'text/plain') self.end_headers() self.wfile.write(f'Processed items: {Actor.get_context().user_data.get("processed_items", 0)}'.encode('utf-8')) else: self.send_error(404, 'Not Found') async def main(): async with Actor( # Set the Actor's main function # This is where your Actor's logic will be executed # For more information, see https://docs.apify.com/sdk/python/docs/guides/getting-started#your-first-actor name='webserver-example', version='0.1.0', ) as actor: # Start the web server port = actor.config.container_port server_address = ('', port) httpd = HTTPServer(server_address, RequestHandler) actor.log.info(f'Web server listening on port {port}') # Simulate processing items for i in range(10): actor.log.info(f'Processing item {i+1}') actor.user_data.processed_items = i + 1 await actor.sleep(1) # Keep the server running until the Actor stops # In a real Actor, you would typically have a loop here that performs tasks # and the 
web server would run in the background. # For this example, we'll just keep it running for a bit. try: httpd.serve_forever() except KeyboardInterrupt: pass finally: httpd.server_close() actor.log.info('Web server stopped.') if __name__ == '__main__': Actor.run(main()) ``` -------------------------------- ### Install Dependencies Source: https://github.com/apify/apify-sdk-python/blob/master/CLAUDE.md Installs all project dependencies, including development ones. ```bash uv sync --all-extras ``` -------------------------------- ### Proxy Rotation Example Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/05_proxy_management.mdx Demonstrates IP rotation and session management using ProxyConfiguration.new_url() with session IDs. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") proxy_configuration = client.actor("apify/proxy-management").create_proxy_configuration( use_apify_proxy=True, ) # Use session ID for consistent proxy assignment session_id = "my-session-123" proxy_url_with_session = proxy_configuration.new_url(session_id=session_id) print(f"Proxy URL with session ID: {proxy_url_with_session}") # Subsequent calls with the same session ID return the same proxy URL proxy_url_with_session_again = proxy_configuration.new_url(session_id=session_id) print(f"Proxy URL with session ID again: {proxy_url_with_session_again}") # Without session ID, proxies are rotated proxy_url_rotated = proxy_configuration.new_url() print(f"Rotated proxy URL: {proxy_url_rotated}") ``` -------------------------------- ### Use Custom Proxies Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/05_proxy_management.mdx Example of configuring and using your own custom proxy servers with the ProxyConfiguration class. 
```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Use custom proxies proxy_configuration = client.actor("apify/proxy-management").create_proxy_configuration( proxy_urls=["http://user:password@custom-proxy.com:8080"] ) print(f"Proxy URL: {proxy_configuration.new_url()}") ``` -------------------------------- ### Install Apify SDK Source: https://context7.com/apify/apify-sdk-python/llms.txt Install the SDK using pip. Use `apify[scrapy]` for Scrapy integration. ```bash pip install apify ``` ```bash pip install apify[scrapy] ``` -------------------------------- ### Install Apify SDK for Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/01_introduction/index.mdx Use this command to manually install the Apify SDK for Python in an existing project. Requires Python 3.10 or above. ```bash pip install apify ``` -------------------------------- ### Install Apify SDK with Scrapy Extra Source: https://github.com/apify/apify-sdk-python/blob/master/README.md Install the Apify SDK with the 'scrapy' extra for integration with Scrapy projects. ```bash pip install apify[scrapy] ``` -------------------------------- ### Run Documentation Locally Source: https://github.com/apify/apify-sdk-python/blob/master/CONTRIBUTING.md Builds and serves the documentation website locally using Docusaurus. Requires Node.js to be installed. ```sh uv run poe run-docs ``` -------------------------------- ### Open and use default storages Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/03_storages.mdx Demonstrates how to open and interact with the default dataset, key-value store, and request queue for an Actor. Ensure you have the Apify SDK installed. 
```python from apify import Actor # Get Actor instance actor = Actor() # Open default storages dataset = await actor.open_dataset() key_value_store = await actor.open_key_value_store() request_queue = await actor.open_request_queue() # Example: Save data to dataset await dataset.push_items([{"key": "value"}]) # Example: Save data to key-value store await key_value_store.set_value("my-key", "my-value") # Example: Add URL to request queue await request_queue.add_request({"url": "https://example.com"}) ``` -------------------------------- ### Use Custom Proxies Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/05_proxy_management.mdx Example of configuring and using your own custom proxy servers with the Apify SDK. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Use your own proxy servers run_input = { "proxy": { "proxyUrls": [ "http://user:password@proxy.example.com:8080", "http://user:password@proxy2.example.com:8081" ] } } client.actor("some/actor").call(run_input=run_input) ``` -------------------------------- ### Use Proxy Configuration with HTTPX Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/05_proxy_management.mdx Integrate the generated proxy configuration with the `httpx` library by passing it to the `proxies` argument. Ensure `httpx` is installed (`pip install httpx`). 
```python import httpx from apify_client import ApifyClient, ProxyConfiguration # Assume proxy_config is an instance of ProxyConfiguration # For example, obtained from Actor input or direct configuration # proxy_config = ProxyConfiguration(use_apify_proxy=True, apify_proxy_options={'groups': ['RESIDENTIAL']}) # If proxy_config is None, no proxy will be used proxy_url = None if proxy_config: proxy_url = proxy_config.proxy_url # This will be None if use_apify_proxy is True # httpx requires a dictionary for the proxies argument httpx_proxies = { "http://": proxy_url, "https://": proxy_url, } # If using Apify Proxy, httpx needs to be configured differently # The ApifyClient handles this internally, but for direct httpx usage: # You might need to construct the proxy URL manually if use_apify_proxy is True # and ApifyClient is not used to make the request. # Example using a direct proxy URL (if not using Apify Proxy directly via ApifyClient) # direct_proxy_url = "http://user:password@proxy.example.com:8080" # httpx_proxies = { # "http://": direct_proxy_url, # "https://": direct_proxy_url, # } # client = httpx.Client(proxies=httpx_proxies) # response = client.get("https://api.apify.com") # print(response.text) ``` -------------------------------- ### Iterate Keys in a Key-Value Store in Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Provides an example of iterating through all record keys in a key-value store. ```python from apify_client import ApifyClient # Iterate over keys in the default key-value store async for key in Actor.iterate_keys(): print(key) ``` -------------------------------- ### Use Proxy with HTTPX Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/05_proxy_management.mdx Integrate generated proxy URLs with the `httpx` library by passing the `ProxyConfiguration` object to the `proxies` argument of the `httpx.Client` or `httpx.AsyncClient`. 
Ensure `httpx` is installed (`pip install httpx`). ```python import httpx from apify_client import ApifyClient, ProxyConfiguration # Assume proxy_config is an instance of ProxyConfiguration # For example, obtained from Actor input or created directly # proxy_config = ProxyConfiguration(use_apify_proxy=True) # Create an httpx client with proxy configuration async with httpx.AsyncClient(proxies=proxy_config.to_dict()) as client: response = await client.get("https://httpbin.org/ip") print(response.json()) ``` -------------------------------- ### Start another Actor Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/06_interacting_with_other_actors.mdx Use `Actor.start` to initiate another Actor on the Apify platform. This method returns immediately with the details of the started Actor run. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_API_TOKEN") # Start an Actor run and get its details immediately run_info = client.actor("apify/hello-world").start() print(f"Started Actor run: {run_info['id']}") ``` -------------------------------- ### Install Playwright Browsers (Windows) Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/03_playwright.mdx Execute this command in PowerShell to install Playwright browsers and their dependencies on Windows. Make sure your virtual environment is activated. ```powershell .venv\Scripts\activate playwright install --with-deps ``` -------------------------------- ### Basic Actor.charge example Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/11_pay_per_event.mdx Use `Actor.charge` to charge users for events. The SDK automatically tracks the maximum charge limit. 
```python from apify import Actor # Example of charging for an event result = await Actor.charge(1.5, 'Custom event description') # You can also push data and charge at the same time await Actor.push_data({'my_data': 'some_value'}, charge=1.0) ``` -------------------------------- ### Configure Actor State Persistence - Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/10_configuration.mdx Use the Configuration class to set Actor options. This example sets the state persistence interval to 10 seconds. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_TOKEN") # Example of how to set configuration options configuration = client.configuration configuration.set_value("persist_storage", 10) print(f"Persist storage every {configuration.get_value('persist_storage')} seconds") ``` -------------------------------- ### Install Playwright Browsers (Linux/macOS) Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/03_playwright.mdx Run this command in your terminal to install Playwright browsers and their dependencies on Linux or macOS. Ensure your virtual environment is activated. ```bash source .venv/bin/activate playwright install --with-deps ``` -------------------------------- ### Request Queue Example - Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Demonstrates basic usage of a request queue, including adding URLs and processing them. Request queues are essential for managing crawling tasks. 
```python from apify import Actor request_queue = await Actor.open_request_queue() # Add URLs to the request queue await request_queue.add_request({'url': 'https://example.com/page1', 'method': 'GET'}) await request_queue.add_request({'url': 'https://example.com/page2', 'method': 'GET'}) # Process requests from the queue while True: request = await request_queue.fetch_next_request() # Returns None if queue is empty if request is None: break print(f"Processing URL: {request['url']}") # Add your scraping logic here await request_queue.mark_request_as_handled(request) print('Finished processing requests.') ``` -------------------------------- ### Python Custom Proxy Configuration Source: https://context7.com/apify/apify-sdk-python/llms.txt Illustrates how to configure and use custom proxy servers with the Apify SDK. This example shows setting up round-robin rotation among a list of provided proxy URLs. ```python import asyncio from apify import Actor async def main() -> None: async with Actor: # Custom proxy URLs with round-robin rotation proxy_cfg = await Actor.create_proxy_configuration( proxy_urls=[ 'http://proxy1.example.com:8000', 'http://proxy2.example.com:8000', ] ) # Get URLs - rotates through the list url1 = await proxy_cfg.new_url() url2 = await proxy_cfg.new_url() Actor.log.info(f'Proxy 1: {url1}') Actor.log.info(f'Proxy 2: {url2}') if __name__ == '__main__': asyncio.run(main()) ``` -------------------------------- ### Configure Custom Proxy Function Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/05_proxy_management.mdx Example of providing a function to dynamically generate custom proxy URLs. 
```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Function to generate proxy URLs def custom_proxy_function(session_id=None): # Replace with your actual proxy URL generation logic return f"http://user:password@proxy.example.com:8080?session_id={session_id}" if session_id else "http://user:password@proxy.example.com:8080" run_input = { "proxy": { "proxyFunction": custom_proxy_function } } client.actor("some/actor").call(run_input=run_input) ``` -------------------------------- ### Use Apify Proxy Locally Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/05_proxy_management.mdx Example of using Apify Proxy locally. Ensure you are running Actors via the Apify CLI and logged in. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Use Apify Proxy for your Actor run run_input = { "proxy": { "useApifyProxy": True } } client.actor("some/actor").call(run_input=run_input) ``` -------------------------------- ### Example Scrapy Actor: __main__.py Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/06_scrapy.mdx The entry point for a Scrapy Actor. It initializes logging, applies Apify settings, and runs the Scrapy spider using the apify.scrapy.run_scrapy_actor function. ```python import logging from apify.log import initialize_logging from apify.scrapy import apply_apify_settings, run_scrapy_actor # Configure logging for the Actor environment initialize_logging(logging.INFO) # Apply Apify-specific settings to Scrapy apply_apify_settings() # Run the Scrapy spider if __name__ == "__main__": run_scrapy_actor() ``` -------------------------------- ### Basic Actor Charge Example Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/11_pay_per_event.mdx Use `Actor.charge` to charge users for specific events. The SDK automatically tracks the maximum charge limit. 
```python from apify_client import ApifyClient # Initialize the ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Get the Actor client actor_client = client.actor("ACTOR_ID") # Example of charging for an event charge_result = actor_client.call(params={"event_type": "process_item", "charge_amount": 100}) if charge_result.get("status") == "SUCCEEDED": print("Charge successful!") else: print("Charge failed.") ``` -------------------------------- ### Start another Actor Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/06_interacting_with_other_actors.mdx Use `Actor.start` to initiate another Actor on the Apify platform. This method returns immediately with the details of the new Actor run. ```python from apify_client import ApifyClient client = ApifyClient("YOUR_APIFY_API_TOKEN") # Start an Actor run started_run_info = client.actor("apify/hello-world").start() print(f"Started Actor run: {started_run_info['id']}") ``` -------------------------------- ### Python Scrapy Actor Example: __main__.py Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/03_guides/06_scrapy.mdx Main entry point for a Scrapy Actor. It initializes logging and runs the Scrapy spider using the Apify SDK. ```python import logging from apify.log import initialize_logging from apify.scrapy import run_scrapy_actor initialize_logging(logging.INFO) if __name__ == "__main__": run_scrapy_actor() ``` -------------------------------- ### Example Playwright Actor for Web Scraping Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/03_playwright.mdx This Python Actor uses Playwright to navigate websites, extract page titles, and follow links recursively. It requires Playwright and Apify SDK to be installed. The Actor starts scraping from URLs provided in its input. 
```python import asyncio from apify_client import ApifyClient from playwright.sync_api import sync_playwright # ApifyClient is used to interact with the Apify platform, e.g. to get input. # You can also use it to push results to a dataset or key-value store. client = ApifyClient("YOUR_APIFY_API_TOKEN") # Fetch input for the Actor input_json = client.user("me").get_storage_client().key_value_store("default").get_record().get("json") # Default input if none is provided if input_json is None: input_json = { "startUrls": [{"url": "https://apify.com"}], "maxDepth": 2 } start_urls = input_json.get("startUrls") max_depth = input_json.get("maxDepth") def run(): with sync_playwright() as p: # Launch the browser browser = p.chromium.launch() page = browser.new_page() # Function to scrape a single page def scrape_page(url, depth): print(f"Scraping: {url} (Depth: {depth})") try: page.goto(url) title = page.title() print(f" Title: {title}") # Extract links if not at max depth if depth < max_depth: links = page.locator("a").all() for link in links: href = link.get_attribute("href") if href and href.startswith("http"): # Avoid re-scraping the same URL # In a real actor, you'd use a set or similar for visited URLs scrape_page(href, depth + 1) except Exception as e: print(f" Error scraping {url}: {e}") # Start scraping from the initial URLs for start_url_obj in start_urls: scrape_page(start_url_obj["url"], 0) browser.close() if __name__ == "__main__": run() ``` -------------------------------- ### Calling Other Actors with Actor.call() Source: https://context7.com/apify/apify-sdk-python/llms.txt Start and interact with other Actors on the Apify platform using `Actor.call()`. This example calls the `apify/screenshot-url` Actor and retrieves its output. 
```python import asyncio from apify import Actor async def main() -> None: async with Actor: # Call another Actor and wait for it to finish actor_run = await Actor.call( actor_id='apify/screenshot-url', run_input={ 'urls': [{'url': 'https://www.apify.com/'}], 'delay': 1000, 'waitUntil': 'load', }, ) if actor_run is None: raise RuntimeError('Actor failed to start.') # Get the output from the Actor's dataset run_client = Actor.apify_client.run(actor_run.id) await run_client.wait_for_finish() dataset_client = run_client.dataset() item_list = await dataset_client.list_items() Actor.log.info(f'Actor output: {item_list.items}') if __name__ == '__main__': asyncio.run(main()) ``` -------------------------------- ### Python Scrapy Actor Example: spiders/title.py Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/03_guides/06_scrapy.mdx A Scrapy spider that scrapes page titles and enqueues links. It starts from a given URL and yields TitleItems. ```python import scrapy from apify.scrapy import to_apify_request from apify.utils import unique_id from ..items import TitleItem class TitleSpider(scrapy.Spider): name = "title" start_urls = ["https://apify.com"] def parse(self, response): item = TitleItem() item["title"] = response.css("title::text").get() yield item for href in response.css('a::attr(href)'): yield response.follow(href, self.parse) def _build_request(self, rule, link): # This method is used by the LinkExtractor to build requests # We use to_apify_request to ensure the request is compatible with Apify return to_apify_request(super()._build_request(rule, link)) def _requests_from_url(self, url): # This method is used by the LinkExtractor to build requests from a URL # We use to_apify_request to ensure the request is compatible with Apify return [to_apify_request(req) for req in super()._requests_from_url(url)] ``` -------------------------------- ### Open Default and Named Storages in Python Source: 
https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Demonstrates opening default dataset, key-value store, and request queue, as well as opening storages by ID or name. ```python from apify_client import ApifyClient # Example of opening default storages default_dataset = Actor.open_dataset() default_key_value_store = Actor.open_key_value_store() default_request_queue = Actor.open_request_queue() # Example of opening storages by ID or name named_dataset = Actor.open_dataset(dataset_id='my-dataset-id') named_key_value_store = Actor.open_key_value_store(key_value_store_name='my-kv-store') named_request_queue = Actor.open_request_queue(request_queue_id='my-rq-id') ``` -------------------------------- ### Example Scrapy Actor: spiders/title.py Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/06_scrapy.mdx A Scrapy spider that scrapes page titles and enqueues links. It starts with a given URL and follows links found on the page. ```python import scrapy from items import Page from apify.scrapy import to_apify_request class TitleSpider(scrapy.Spider): name = "title" start_urls = ["https://apify.com/"] def parse(self, response): # Create an item with the page URL and title yield Page(url=response.url, title=response.css("title::text").get()) # Enqueue links found on the page for href in response.css("a::attr(href)"): yield response.follow(href, self.parse) def _build_request(self, rule, callback=None): # Convert Scrapy Request to Apify Request return to_apify_request(super()._build_request(rule, callback)) ``` -------------------------------- ### Open Default Storages - Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Demonstrates how to open the default dataset, key-value store, and request queue for an Actor. Ensure the Apify client is initialized. 
```python from apify import Actor # Get default dataset, key-value store, and request queue dataset = await Actor.open_dataset() key_value_store = await Actor.open_key_value_store() request_queue = await Actor.open_request_queue() ``` -------------------------------- ### Python Selenium Actor Example Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/04_selenium.mdx A Python script demonstrating a simple Apify Actor that uses Selenium ChromeDriver to scrape titles and anchor elements from linked websites recursively. Ensure Selenium and necessary browser drivers are installed when running locally. ```python import time from urllib.parse import urljoin from apify_client import ApifyClient from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.common.by import By # Apify client setup client = ApifyClient("YOUR_APIFY_API_TOKEN") # Selenium setup chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") driver = webdriver.Chrome(options=chrome_options) def scrape_page(url): """Scrapes a single page using Selenium.""" print(f"Scraping: {url}") driver.get(url) time.sleep(2) # Wait for the page to load page_data = { "url": url, "title": driver.title, "links": [] } # Find all anchor elements links = driver.find_elements(By.TAG_NAME, "a") for link in links: href = link.get_attribute("href") if href: absolute_url = urljoin(url, href) page_data["links"].append(absolute_url) return page_data def main(): """Main function to run the scraping Actor.""" # Get input from Actor input input_data = client.user("me").get_storage_info().get("input") or {} start_urls = input_data.get("startUrls", ["https://apify.com"]) max_depth = input_data.get("maxDepth", 2) scraped_urls = set() urls_to_visit = [(url, 0) for url in start_urls] results = [] while urls_to_visit: current_url, current_depth = 
urls_to_visit.pop(0) if current_url in scraped_urls or current_depth > max_depth: continue try: page_info = scrape_page(current_url) results.append(page_info) scraped_urls.add(current_url) if current_depth < max_depth: for link in page_info["links"]: if link not in scraped_urls: urls_to_visit.append((link, current_depth + 1)) except Exception as e: print(f"Error scraping {current_url}: {e}") # Save results to the "results" dataset dataset_client = client.dataset("results") dataset_client.push_items(results) print("Scraping finished.") # Close the browser driver.quit() if __name__ == "__main__": main() ``` -------------------------------- ### Example Playwright Actor for Recursive Scraping Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/03_guides/03_playwright.mdx This Python Actor uses Playwright to scrape titles from linked websites recursively. It opens pages in an automated Chromium browser, extracts titles and anchor elements after page load, and respects a maximum scraping depth. Ensure Playwright is installed. 
```python import asyncio import logging from urllib.parse import urljoin from apify_client import ApifyClient from playwright.sync_api import sync_playwright # Configure logging logging.basicConfig(level=logging.INFO) # Apify Client initialization # You can also use ApifyClient(token='YOUR_APIFY_TOKEN') or ApifyClient(api_url='YOUR_API_URL') client = ApifyClient() # Actor input INPUT = { "startUrls": [ { "url": "https://apify.com" } ], "maxDepth": 2 } def main(): # Initialize Playwright with sync_playwright() as p: # Launch browser (Chromium in this case) browser = p.chromium.launch() page = browser.new_page() # Process URLs from input for start_url_info in INPUT["startUrls"]: url = start_url_info["url"] logging.info(f"Scraping: {url}") scrape_page(page, url, 0, INPUT["maxDepth"]) # Close the browser browser.close() def scrape_page(page, url, depth, max_depth): if depth > max_depth: return try: # Navigate to the URL page.goto(url) # Extract page title title = page.title() logging.info(f" Title: {title}") # Extract links links = page.locator("a").all() for link in links: href = link.get_attribute("href") if href: # Resolve relative URLs absolute_url = urljoin(url, href) # Basic check to avoid mailto, tel, etc. and stay on the same domain if absolute_url.startswith("http") and url in absolute_url: logging.info(f" Found link: {absolute_url}") # Recursively scrape the linked page scrape_page(page, absolute_url, depth + 1, max_depth) except Exception as e: logging.error(f"Error scraping {url}: {e}") if __name__ == "__main__": main() ``` -------------------------------- ### Initialize Actor with Context Manager Source: https://context7.com/apify/apify-sdk-python/llms.txt Use the async context manager for automatic initialization, configuration loading, and graceful shutdown. Recommended for most Actors. 
```python import asyncio from apify import Actor async def main() -> None: async with Actor: # Get input from the default key-value store actor_input = await Actor.get_input() or {} Actor.log.info(f'Actor input: {actor_input}') # Your Actor logic here data = {'message': 'Hello from Actor!', 'input': actor_input} await Actor.push_data(data) # Set status message visible in Apify Console await Actor.set_status_message('Actor completed successfully') if __name__ == '__main__': asyncio.run(main()) ``` -------------------------------- ### Python Scrapy Actor Example: items.py Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/03_guides/06_scrapy.mdx Defines the structure of scraped items. This example defines a simple 'TitleItem' with a 'title' field. ```python import scrapy class TitleItem(scrapy.Item): title = scrapy.Field() ``` -------------------------------- ### Python Apify Proxy Configuration Source: https://context7.com/apify/apify-sdk-python/llms.txt Demonstrates how to create and use Apify Proxy configurations for rotating IP addresses. Shows how to get a new proxy URL and use it with `httpx` for making requests. Includes session support for sticky IPs. 
```python
import asyncio

import httpx

from apify import Actor


async def main() -> None:
    """Create a proxy configuration and make one request through it."""
    async with Actor:
        # Build the Apify Proxy configuration; stop if none is available.
        proxy_cfg = await Actor.create_proxy_configuration()
        if not proxy_cfg:
            raise RuntimeError('No proxy configuration available.')

        # Each call returns a proxy URL (rotates automatically).
        proxy_url = await proxy_cfg.new_url()
        Actor.log.info(f'Using proxy: {proxy_url}')

        # Passing a session ID gives a sticky IP for that session.
        session_proxy = await proxy_cfg.new_url(session_id='my-session')

        # Route an HTTPX request through the proxy.
        async with httpx.AsyncClient(proxy=proxy_url) as http_client:
            response = await http_client.get('https://api.ipify.org')
            Actor.log.info(f'IP: {response.text}')


if __name__ == '__main__':
    asyncio.run(main())
```
-------------------------------- ### Example Scrapy Actor: items.py Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/06_scrapy.mdx Defines the structure of scraped items. In this example, it defines a 'Page' item with 'url' and 'title' fields.
```python
import scrapy


class Page(scrapy.Item):
    """Scraped item with `url` and `title` fields."""

    url = scrapy.Field()
    title = scrapy.Field()
```
-------------------------------- ### Initialize BasicCrawler with Actor Services Source: https://github.com/apify/apify-sdk-python/blob/master/docs/04_upgrading/upgrading_to_v3.md Demonstrates initializing a BasicCrawler that inherits services from the Actor and global service_locator. Also shows how to initialize a crawler with custom services.
```python
from crawlee.configuration import Configuration
from crawlee.crawlers import BasicCrawler
from crawlee.events import LocalEventManager
from crawlee.storage_clients import MemoryStorageClient

from apify import Actor


async def main():
    """Create one crawler with inherited services and one with custom services."""
    async with Actor():
        # No arguments: same services as the Actor and the global service_locator.
        crawler_1 = BasicCrawler()

        # Explicit arguments: this crawler uses its own services.
        custom_storage_client = MemoryStorageClient()
        custom_configuration = Configuration()
        custom_event_manager = LocalEventManager.from_config(custom_configuration)

        crawler_2 = BasicCrawler(
            configuration=custom_configuration,
            event_manager=custom_event_manager,
            storage_client=custom_storage_client,
        )
```
-------------------------------- ### Create a new Actor with Apify CLI Source: https://github.com/apify/apify-sdk-python/blob/master/docs/01_introduction/quick-start.mdx Use the `apify create` command with a Python template to initialize a new Actor project. This sets up the project structure, virtual environment, and dependencies.
```bash
apify create my-first-actor --template python-start
```
-------------------------------- ### Actor entrypoint in __main__.py Source: https://github.com/apify/apify-sdk-python/blob/master/docs/01_introduction/quick-start.mdx This file serves as the entrypoint for the Actor package. It sets up the logger and executes the main function using `asyncio.run()`.
```python
import asyncio
import logging

from src import main

# Set up the root logger before the Actor starts.
logging.basicConfig(level=logging.INFO)

if __name__ == "__main__":
    # Run the package's async main() to completion.
    asyncio.run(main.main())
```
-------------------------------- ### Key-Value Store Read and Write - Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Shows how to store and retrieve data from a key-value store. Key-value stores are suitable for storing Actor state, configuration, or binary files.
```python
from apify import Actor


async def main() -> None:
    """Write two records to the default key-value store and read them back."""
    # Storages are opened inside the Actor context so the Actor is initialized.
    async with Actor:
        kvs = await Actor.open_key_value_store()

        # Store data in the key-value store
        await kvs.set_value('my-key', {'data': 'some value'})
        await kvs.set_value('another-key', 'plain text value')

        # Retrieve data from the key-value store
        retrieved_data = await kvs.get_value('my-key')
        retrieved_text = await kvs.get_value('another-key')

        print(retrieved_data)
        print(retrieved_text)
```
-------------------------------- ### Read and Write Data to a Dataset in Python Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Demonstrates pushing data to a dataset and reading data from it using push_data and get_data methods.
```python
from apify import Actor


async def main() -> None:
    """Push two items to the default dataset and read them back."""
    async with Actor:
        # Push data to the default dataset
        await Actor.push_data([{'data': 1}, {'data': 2}])

        # Reading is done on the dataset object, not on Actor itself.
        dataset = await Actor.open_dataset()

        # Read data from the default dataset
        all_data = await dataset.get_data()

        # Iterate over items in the default dataset
        async for item in dataset.iterate_items():
            print(item)
```
-------------------------------- ### Running E2E Tests Source: https://github.com/apify/apify-sdk-python/blob/master/tests/e2e/README.md Set the API token and run the E2E tests using the `uv` command. Optionally, set the API URL for different environments.
```bash
export APIFY_TEST_USER_API_TOKEN=
uv run poe e2e-tests
```
-------------------------------- ### Read and write key-value store records Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/03_storages.mdx Demonstrates reading and writing data to a key-value store. This storage is suitable for storing arbitrary data, such as configuration, state, or binary files.
```python
from apify import Actor


async def main() -> None:
    """Write a JSON record and a binary record, then read both back."""
    # Storages are opened inside the Actor context so the Actor is initialized.
    async with Actor:
        key_value_store = await Actor.open_key_value_store()

        # Write data to the key-value store
        await key_value_store.set_value("user-settings", {"theme": "dark", "notifications": True})
        # The content type is given on write only; it must match the payload
        # (the sample bytes start with the PNG signature).
        await key_value_store.set_value("profile-picture.png", b"\x89PNG...", content_type="image/png")

        # Read data from the key-value store
        settings = await key_value_store.get_value("user-settings")
        print(f"User settings: {settings}")

        # get_value takes only the key (and an optional default value).
        image_data = await key_value_store.get_value("profile-picture.png")
        print(f"Read image data: {len(image_data)} bytes")
```
-------------------------------- ### Working with Key-Value Stores - Iterating Keys Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/03_storages.mdx Get an iterator of record keys from a key-value store using the `iterate_keys` method.
```APIDOC
### Iterating keys

To get an iterator of the key-value store record keys, you can use the `KeyValueStore.iterate_keys` method.
```
-------------------------------- ### Example Scrapy Actor: main.py Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/06_scrapy.mdx This file is typically empty in a Scrapy Actor project, as the main execution logic is handled by __main__.py.
```python
# This file is intentionally left blank.
# The main execution logic is in __main__.py.
```
-------------------------------- ### Configure Apify Proxy Groups and Countries Source: https://github.com/apify/apify-sdk-python/blob/master/docs/02_concepts/05_proxy_management.mdx Example of configuring Apify Proxy to use specific proxy groups and countries for connections.
```python
from apify_client import ApifyClient

client = ApifyClient("YOUR_APIFY_API_TOKEN")

# Proxy settings: use Apify Proxy with specific groups and a country.
# NOTE(review): these key names depend on the target Actor's input schema — confirm.
proxy_settings = {
    "useApifyProxy": True,
    "groups": ["RESIDENTIAL", "BUSINESS"],
    "country": "US",
}
run_input = {"proxy": proxy_settings}

# Start the Actor with the input above.
client.actor("some/actor").call(run_input=run_input)
```
-------------------------------- ### Scrapy Actor Entry Point (__main__.py) Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/03_guides/06_scrapy.mdx This file serves as the entry point for an Apify Actor running a Scrapy project. It initializes logging and runs the Scrapy spider using the Apify SDK's integration function. Ensure the SCRAPY_SETTINGS_MODULE environment variable is set.
```python
import asyncio

from apify.log import initialize_logging
from apify.scrapy import run_scrapy_actor

# NOTE(review): confirm the import path of initialize_logging and the call
# signature of run_scrapy_actor against the installed apify version.


def main():
    """Set up logging, then run the Scrapy spider as an Apify Actor."""
    initialize_logging()
    actor_run = run_scrapy_actor()
    asyncio.run(actor_run)


if __name__ == '__main__':
    main()
```
-------------------------------- ### Scrapy Actor Entry Point (__main__.py) Source: https://github.com/apify/apify-sdk-python/blob/master/docs/03_guides/06_scrapy.mdx This file serves as the entry point for an Apify Actor running a Scrapy project. It initializes logging and runs the Scrapy Actor using `apify.scrapy.run_scrapy_actor`, which handles the integration of Twisted and asyncio event loops.
```python
import asyncio

from apify.log import initialize_logging
from apify.scrapy import run_scrapy_actor

# NOTE(review): confirm the import path of initialize_logging and the call
# signature of run_scrapy_actor against the installed apify version.


def main():
    """Initialize logging and run the Scrapy Actor."""
    initialize_logging()
    actor_run = run_scrapy_actor()
    asyncio.run(actor_run)


if __name__ == '__main__':
    main()
```
-------------------------------- ### Open storages by name Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/03_storages.mdx Shows how to open specific storages (dataset, key-value store, request queue) by their names, not just the default ones. This is useful for managing multiple storages.
```python
from apify import Actor


async def main() -> None:
    """Open a dataset, a key-value store and a request queue by name."""
    # Storages are opened inside the Actor context so the Actor is initialized.
    async with Actor:
        # Open storages by name (all three calls take the same `name` keyword)
        dataset = await Actor.open_dataset(name="my-dataset")
        key_value_store = await Actor.open_key_value_store(name="my-kvs")
        request_queue = await Actor.open_request_queue(name="my-rq")
```
-------------------------------- ### Assertion Message Example Source: https://github.com/apify/apify-sdk-python/blob/master/tests/e2e/README.md When writing assertions inside Actors, include explicit messages to aid in debugging, as only a bare `AssertionError` is shown on failure.
```python
assert is_finished is False, f'is_finished={is_finished}'
```
-------------------------------- ### Open Storage Using Alias in Python Source: https://github.com/apify/apify-sdk-python/blob/master/website/versioned_docs/version-3.3/02_concepts/03_storages.mdx Demonstrates opening a storage using an alias, which creates a run-scoped, human-readable reference. This is mutually exclusive with `id` and `name`.
```python
from apify import Actor


async def main() -> None:
    """Open a key-value store through a run-scoped alias and write one record."""
    async with Actor:
        # Open a key-value store using an alias. `alias` is an argument of the
        # Actor storage helpers and cannot be combined with `id` or `name`.
        kvs_alias = await Actor.open_key_value_store(alias="my-run-scoped-kvs")

        # You can now use kvs_alias to interact with this storage within the current run
        await kvs_alias.set_value("key", "value")
        print("Storage opened with alias 'my-run-scoped-kvs'.")
```