import asyncio
from crawlee.playwright_crawler import (
PlaywrightCrawler,
PlaywrightCrawlingContext,
)
async def main() -> None:
    """Crawl https://crawlee.dev with Playwright and export the results to CSV.

    Builds a ``PlaywrightCrawler`` capped at 10 requests, registers a default
    request handler that records each page's URL and title, runs the crawl
    from the start URL, and finally writes the default dataset to
    ``results.csv``.
    """
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        """Record the URL and title of the current page and enqueue its links."""
        context.log.info(f"Processing {context.request.url} ...")

        # `context.page` is a Playwright `Page`, not a BeautifulSoup document:
        # `title` is an async method that must be called and awaited. The
        # BeautifulSoup-style `page.title.string` raises AttributeError, so no
        # data was ever pushed and the later CSV export hit an empty dataset.
        title = await context.page.title()

        # Extract data from the page. An empty title is stored as None.
        data = {
            "url": context.request.url,
            "title": title or None,
        }

        # Enqueue all links found on the page.
        await context.enqueue_links()

        # Push the extracted data to the default dataset.
        await context.push_data(data)

    # Run the crawler with the initial list of URLs.
    await crawler.run(["https://crawlee.dev"])

    # Export the entire dataset to a CSV file.
    await crawler.export_data("results.csv")
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    asyncio.run(main())
Traceback (most recent call last):
File "g:\python\python_experiment\my-crawler\my-crawler\routes.py", line 40, in <module>
asyncio.run(main())
File "D:\python3.9\lib\asyncio\runners.py", line 44, in run
return loop.run_until_complete(main)
File "D:\python3.9\lib\asyncio\base_events.py", line 642, in run_until_complete
return future.result()
File "g:\python\python_experiment\my-crawler\my-crawler\routes.py", line 36, in main
await crawler.export_data("results.csv")
File "g:\python\python_experiment\venv\lib\site-packages\crawlee\basic_crawler\basic_crawler.py", line 474, in export_data
return await dataset.write_to(content_type, path.open('w', newline=''))
File "g:\python\python_experiment\venv\lib\site-packages\crawlee\storages\dataset.py", line 213, in write_to
writer.writerows([items[0].keys(), *[item.values() for item in items]])
IndexError: list index out of range
TERMINAL: