Test program (reproduces the `KeyError: 'proxy'` failure shown in the terminal output below):
import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration
# Proxy servers used by this test. If any of them go out of service,
# replace them with your own.
proxies = [
    'http://178.48.68.61:18080',
    'http://198.245.60.202:3128',
    'http://15.204.240.177:3128',
]

# Flat configuration: the proxy list is handed over directly via `proxy_urls`.
proxy_configuration_fails = ProxyConfiguration(proxy_urls=proxies)

# Tiered configuration: the same proxy list, wrapped in tiers.
proxy_configuration_succeeds = ProxyConfiguration(
    tiered_proxy_urls=[
        # Tier without any proxy. Not required; include it only if the lowest
        # tier should run with no proxy at all.
        [None],
        # Lower tier: cheaper proxies, preferred for as long as they work.
        proxies,
        # A higher, more expensive fallback tier would be listed here;
        # none is configured in this test.
    ],
)
async def main() -> None:
    """Run a small Playwright crawl of https://crawlee.dev through a proxy configuration.

    Builds a Firefox-based ``PlaywrightCrawler`` limited to five requests,
    registers a default handler that enqueues links and stores a short record
    per page, runs the crawl, exports the dataset to ``results.json`` and
    finally logs the collected items.
    """
    crawler = PlaywrightCrawler(
        max_requests_per_crawl=5,  # Stop after five requests.
        headless=False,  # Keep the browser window visible.
        browser_type='firefox',  # Drive Firefox.
        proxy_configuration=proxy_configuration_fails,
        # proxy_configuration=proxy_configuration_succeeds,
    )

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Handle one request: enqueue its links and store url/title/content."""
        context.log.info(f'Processing {context.request.url} ...')

        # Queue every link discovered on the current page.
        await context.enqueue_links()

        # Collect the page details through the Playwright page object.
        record = {'url': context.request.url}
        page = context.page
        record['title'] = await page.title()
        html = await page.content()
        record['content'] = html[:100]  # Only the first 100 characters are kept.

        # Store the record in the default dataset.
        await context.push_data(record)

    # Start crawling from the single seed URL.
    start_urls = ['https://crawlee.dev']
    await crawler.run(start_urls)

    # Write the whole dataset out as JSON.
    await crawler.export_data('results.json')

    # The data can also be read back and used directly.
    data = await crawler.get_data()
    crawler.log.info(f'Extracted data: {data.items}')
# Script entry point: run the async crawl when the file is executed directly.
if __name__ == '__main__':
    asyncio.run(main())
Terminal output (run with `proxy_configuration_fails`; the request fails with `KeyError: 'proxy'`):
/Users/matecsaj/PycharmProjects/wat-crawlee/venv/bin/python /Users/matecsaj/Library/Application Support/JetBrains/PyCharm2024.3/scratches/scratch_4.py
[crawlee._autoscaling.snapshotter] INFO Setting max_memory_size of this run to 8.00 GB.
[crawlee.crawlers._playwright._playwright_crawler] INFO Current request statistics:
┌───────────────────────────────┬──────────┐
│ requests_finished │ 0 │
│ requests_failed │ 0 │
│ retry_histogram │ [0] │
│ request_avg_failed_duration │ None │
│ request_avg_finished_duration │ None │
│ requests_finished_per_minute │ 0 │
│ requests_failed_per_minute │ 0 │
│ request_total_duration │ 0.0 │
│ requests_total │ 0 │
│ crawler_runtime │ 0.038974 │
└───────────────────────────────┴──────────┘
[crawlee._autoscaling.autoscaled_pool] INFO current_concurrency = 0; desired_concurrency = 2; cpu = 0.0; mem = 0.0; event_loop = 0.0; client_info = 0.0
[crawlee.crawlers._playwright._playwright_crawler] ERROR Request failed and reached maximum retries
Traceback (most recent call last):
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/crawlers/_basic/_context_pipeline.py", line 65, in __call__
result = await middleware_instance.__anext__()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/crawlers/_playwright/_playwright_crawler.py", line 138, in _open_page
crawlee_page = await self._browser_pool.new_page(proxy_info=context.proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/_utils/context.py", line 38, in async_wrapper
return await method(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_browser_pool.py", line 241, in new_page
return await self._get_new_page(page_id, plugin, proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_browser_pool.py", line 270, in _get_new_page
page = await asyncio.wait_for(
^^^^^^^^^^^^^^^^^^^^^^^
...<5 lines>...
)
^
File "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/asyncio/tasks.py", line 507, in wait_for
return await fut
^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_playwright_browser_controller.py", line 119, in new_page
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/matecsaj/PycharmProjects/wat-crawlee/venv/lib/python3.13/site-packages/crawlee/browsers/_playwright_browser_controller.py", line 174, in _create_browser_context
if browser_new_context_options['proxy']:
~~~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^
KeyError: 'proxy'
[crawlee._autoscaling.autoscaled_pool] INFO Waiting for remaining tasks to finish
[crawlee.crawlers._playwright._playwright_crawler] INFO Error analysis: total_errors=3 unique_errors=1
[crawlee.crawlers._playwright._playwright_crawler] INFO Final request statistics:
┌───────────────────────────────┬───────────┐
│ requests_finished │ 0 │
│ requests_failed │ 1 │
│ retry_histogram │ [0, 0, 1] │
│ request_avg_failed_duration │ 0.025703 │
│ request_avg_finished_duration │ None │
│ requests_finished_per_minute │ 0 │
│ requests_failed_per_minute │ 14 │
│ request_total_duration │ 0.025703 │
│ requests_total │ 1 │
│ crawler_runtime │ 4.189647 │
└───────────────────────────────┴───────────┘
[crawlee.storages._dataset] WARN Attempting to export an empty dataset - no file will be created
[crawlee.crawlers._playwright._playwright_crawler] INFO Extracted data: []
Process finished with exit code 0
Test program.
Terminal output.