From 44ce12c62c5c02421cd760c89d8ffda9dd59c208 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 9 Sep 2024 13:13:34 +0530 Subject: [PATCH 01/20] Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy --- crawl4ai/scraper/__init__.py | 0 crawl4ai/scraper/async_web_scraper.py | 36 +++++++++++++ crawl4ai/scraper/bfs_scraper_strategy.py | 50 +++++++++++++++++++ crawl4ai/scraper/filters/__init__.py | 3 ++ .../scraper/filters/content_type_filter.py | 8 +++ crawl4ai/scraper/filters/url_filter.py | 16 ++++++ .../scraper/filters/url_pattern_filter.py | 9 ++++ crawl4ai/scraper/models.py | 7 +++ crawl4ai/scraper/scorers/__init__.py | 2 + .../scorers/keyword_relevance_scorer.py | 9 ++++ crawl4ai/scraper/scorers/url_scorer.py | 6 +++ crawl4ai/scraper/scraper_strategy.py | 9 ++++ 12 files changed, 155 insertions(+) create mode 100644 crawl4ai/scraper/__init__.py create mode 100644 crawl4ai/scraper/async_web_scraper.py create mode 100644 crawl4ai/scraper/bfs_scraper_strategy.py create mode 100644 crawl4ai/scraper/filters/__init__.py create mode 100644 crawl4ai/scraper/filters/content_type_filter.py create mode 100644 crawl4ai/scraper/filters/url_filter.py create mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py create mode 100644 crawl4ai/scraper/models.py create mode 100644 crawl4ai/scraper/scorers/__init__.py create mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py create mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 crawl4ai/scraper/scraper_strategy.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py new file mode 100644 index 00000000..c67f0e14 --- /dev/null +++ b/crawl4ai/scraper/async_web_scraper.py @@ -0,0 +1,36 @@ +import asyncio +from typing import List, Dict +from .scraper_strategy import ScraperStrategy +from .bfs_scraper_strategy import BFSScraperStrategy +from .models import ScraperResult +from ..async_webcrawler import AsyncWebCrawler + +class BatchProcessor: + def __init__(self, batch_size: int, concurrency_limit: int): + self.batch_size = batch_size + self.concurrency_limit = concurrency_limit + + async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: + semaphore = asyncio.Semaphore(self.concurrency_limit) + async def scrape_with_semaphore(url): + async with semaphore: + return await scraper.ascrape(url) + return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) + +class AsyncWebScraper: + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + self.crawler = crawler + self.strategy = strategy + self.batch_processor = BatchProcessor(batch_size, concurrency_limit) + + async def ascrape(self, url: str) -> ScraperResult: + crawl_result = await self.crawler.arun(url) + return await self.strategy.ascrape(url, crawl_result, self.crawler) + + async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: + all_results = [] + for i in range(0, len(urls), self.batch_processor.batch_size): + batch = urls[i:i+self.batch_processor.batch_size] + batch_results = await self.batch_processor.process_batch(self, batch) + all_results.extend(batch_results) + return all_results \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py new file mode 
100644 index 00000000..9add962e --- /dev/null +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -0,0 +1,50 @@ +from .scraper_strategy import ScraperStrategy +from .filters import FilterChain +from .scorers import URLScorer +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler +import asyncio +from urllib.parse import urljoin + +class BFSScraperStrategy(ScraperStrategy): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + + async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + queue = asyncio.PriorityQueue() + queue.put_nowait((0, 0, start_url)) # (score, depth, url) + visited = set() + crawled_urls = [] + extracted_data = {} + + while not queue.empty(): + _, depth, url = await queue.get() + if depth > self.max_depth or url in visited: + continue + crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) + visited.add(url) + crawled_urls.append(url) + extracted_data[url]=crawl_result + if crawl_result.success == False: + print(f"failed to crawl -- {url}") + continue + for internal in crawl_result.links["internal"]: + link = internal['href'] + is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) + is_fragment = '#' in link + if not (is_fragment or is_special_uri): + # To fix partial links: eg:'/support' to 'https://example.com/support' + absolute_link = urljoin(url, link) + if self.filter_chain.apply(absolute_link) and absolute_link not in visited: + score = self.url_scorer.score(absolute_link) + await queue.put((1 / score, depth + 1, absolute_link)) + for external in crawl_result.links["external"]: + link = external['href'] + if self.filter_chain.apply(link) and link not in visited: + score = self.url_scorer.score(link) + await queue.put((1 / score, depth + 1, link)) + + return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py new file mode 100644 index 00000000..525c9bdb --- /dev/null +++ b/crawl4ai/scraper/filters/__init__.py @@ -0,0 +1,3 @@ +from .url_filter import URLFilter, FilterChain +from .content_type_filter import ContentTypeFilter +from .url_pattern_filter import URLPatternFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py new file mode 100644 index 00000000..9173eb4a --- /dev/null +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -0,0 +1,8 @@ +from .url_filter import URLFilter + +class ContentTypeFilter(URLFilter): + def __init__(self, contentType: str): + self.contentType = contentType + def apply(self, url: str) -> bool: + #TODO: This is a stub. 
Will implement this later + return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py new file mode 100644 index 00000000..2b8bd6eb --- /dev/null +++ b/crawl4ai/scraper/filters/url_filter.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +class URLFilter(ABC): + @abstractmethod + def apply(self, url: str) -> bool: + pass + +class FilterChain: + def __init__(self): + self.filters = [] + + def add_filter(self, filter: URLFilter): + self.filters.append(filter) + + def apply(self, url: str) -> bool: + return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py new file mode 100644 index 00000000..fd5df133 --- /dev/null +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -0,0 +1,9 @@ +from .url_filter import URLFilter +from re import Pattern + +class URLPatternFilter(URLFilter): + def __init__(self, pattern: Pattern): + self.pattern = pattern + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later. + return True \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py new file mode 100644 index 00000000..9ffdac52 --- /dev/null +++ b/crawl4ai/scraper/models.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import List, Dict + +class ScraperResult(BaseModel): + url: str + crawled_urls: List[str] + extracted_data: Dict \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py new file mode 100644 index 00000000..05c61c94 --- /dev/null +++ b/crawl4ai/scraper/scorers/__init__.py @@ -0,0 +1,2 @@ +from .url_scorer import URLScorer +from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py new file mode 100644 index 00000000..a2338aec --- /dev/null +++ b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py @@ -0,0 +1,9 @@ +from .url_scorer import URLScorer +from typing import List + +class KeywordRelevanceScorer(URLScorer): + def __init__(self, keywords: List[str]): + self.keywords = keywords + def score(self, url: str) -> float: + #TODO: This is a stub. Will implement this later.
+ return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py new file mode 100644 index 00000000..6ee9ab05 --- /dev/null +++ b/crawl4ai/scraper/scorers/url_scorer.py @@ -0,0 +1,6 @@ +from abc import ABC, abstractmethod + +class URLScorer(ABC): + @abstractmethod + def score(self, url: str) -> float: + pass \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py new file mode 100644 index 00000000..16df9ece --- /dev/null +++ b/crawl4ai/scraper/scraper_strategy.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler + +class ScraperStrategy(ABC): + @abstractmethod + async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + pass \ No newline at end of file From 7f3e2e47ed99de1db4bc99d69d0a6f1ddaef962f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 19 Sep 2024 12:34:12 +0530 Subject: [PATCH 02/20] Parallel processing with retry on failure and exponential backoff - Simplified URL validation and normalisation - Respecting robots.txt --- crawl4ai/scraper/__init__.py | 2 + crawl4ai/scraper/async_web_scraper.py | 3 +- crawl4ai/scraper/bfs_scraper_strategy.py | 139 ++++++++++++++++++----- crawl4ai/scraper/models.py | 3 +- crawl4ai/scraper/scraper_strategy.py | 2 +- 5 files changed, 116 insertions(+), 33 deletions(-) diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py index e69de29b..1997e162 100644 --- a/crawl4ai/scraper/__init__.py +++ b/crawl4ai/scraper/__init__.py @@ -0,0 +1,2 @@ +from .async_web_scraper import AsyncWebScraper +from .bfs_scraper_strategy import BFSScraperStrategy \ No newline at end of file diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index c67f0e14..6cf5488c 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -24,8 +24,7 @@ class AsyncWebScraper: self.batch_processor = BatchProcessor(batch_size, concurrency_limit) async def ascrape(self, url: str) -> ScraperResult: - crawl_result = await self.crawler.arun(url) - return await self.strategy.ascrape(url, crawl_result, self.crawler) + return await self.strategy.ascrape(url, self.crawler) async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: all_results = [] diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 9add962e..a8fb1fe1 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -5,46 +5,127 @@ from .models import ScraperResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler import asyncio -from urllib.parse import urljoin +import validators +from urllib.parse import urljoin, urlparse, urlunparse +from urllib.robotparser import RobotFileParser +import time +from aiolimiter import AsyncLimiter +from tenacity import retry, stop_after_attempt, wait_exponential +from collections import defaultdict +import logging +logging.basicConfig(level=logging.DEBUG) + +rate_limiter = AsyncLimiter(1, 1) # 1 request per second class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5): self.max_depth = max_depth
self.filter_chain = filter_chain self.url_scorer = url_scorer + self.max_concurrent = max_concurrent + # 9. Crawl Politeness + self.last_crawl_time = defaultdict(float) + self.min_crawl_delay = 1 # 1 second delay between requests to the same domain + # 5. Robots.txt Compliance + self.robot_parsers = {} + + # Robots.txt Parser + def get_robot_parser(self, url: str) -> RobotFileParser: + domain = urlparse(url).netloc + if domain not in self.robot_parsers: + rp = RobotFileParser() + rp.set_url(f"https://{domain}/robots.txt") + rp.read() + self.robot_parsers[domain] = rp + return self.robot_parsers[domain] + + # Retry with exponential backoff + @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) + async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: + return await crawler.arun(url) + + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set) -> CrawlResult: + def normalize_url(url: str) -> str: + parsed = urlparse(url) + return urlunparse(parsed._replace(fragment="")) + + # URL Validation + if not validators.url(url): + logging.warning(f"Invalid URL: {url}") + return None + + # Robots.txt Compliance + if not self.get_robot_parser(url).can_fetch("YourUserAgent", url): + logging.info(f"Skipping {url} as per robots.txt") + return None + + # Crawl Politeness + domain = urlparse(url).netloc + time_since_last_crawl = time.time() - self.last_crawl_time[domain] + if time_since_last_crawl < self.min_crawl_delay: + await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl) + self.last_crawl_time[domain] = time.time() - async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + # Rate Limiting + async with rate_limiter: + # Error Handling + try: + crawl_result = await self.retry_crawl(crawler, url) + except Exception as e: + logging.error(f"Error crawling {url}: {str(e)}") + crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e)) + + if not crawl_result.success: + # Logging and Monitoring + logging.error(f"Failed to crawl URL: {url}. 
Error: {crawl_result.error_message}") + # Error Categorization + if crawl_result.status_code == 404: + self.remove_from_future_crawls(url) + elif crawl_result.status_code == 503: + await self.add_to_retry_queue(url) + return crawl_result + + # Content Type Checking + # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''): + # logging.info(f"Skipping non-HTML content: {url}") + # return crawl_result + + visited.add(url) + + # Process links + for link_type in ["internal", "external"]: + for link in crawl_result.links[link_type]: + absolute_link = urljoin(url, link['href']) + normalized_link = normalize_url(absolute_link) + if self.filter_chain.apply(normalized_link) and normalized_link not in visited: + new_depth = depth + 1 + if new_depth <= self.max_depth: + # URL Scoring + score = self.url_scorer.score(normalized_link) + await queue.put((score, new_depth, normalized_link)) + + return crawl_result + + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler) -> ScraperResult: queue = asyncio.PriorityQueue() - queue.put_nowait((0, 0, start_url)) # (score, depth, url) + queue.put_nowait((0, 0, start_url)) visited = set() crawled_urls = [] extracted_data = {} while not queue.empty(): - _, depth, url = await queue.get() - if depth > self.max_depth or url in visited: - continue - crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) - visited.add(url) - crawled_urls.append(url) - extracted_data[url]=crawl_result - if crawl_result.success == False: - print(f"failed to crawl -- {url}") - continue - for internal in crawl_result.links["internal"]: - link = internal['href'] - is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) - is_fragment = '#' in link - if not (is_fragment or is_special_uri): - # To fix partial links: eg:'/support' to 'https://example.com/support' - absolute_link = urljoin(url, link) - if self.filter_chain.apply(absolute_link) and absolute_link not in visited: - score = self.url_scorer.score(absolute_link) - await queue.put((1 / score, depth + 1, absolute_link)) - for external in crawl_result.links["external"]: - link = external['href'] - if self.filter_chain.apply(link) and link not in visited: - score = self.url_scorer.score(link) - await queue.put((1 / score, depth + 1, link)) + tasks = [] + while not queue.empty() and len(tasks) < self.max_concurrent: + _, depth, url = await queue.get() + if url not in visited: + task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited)) + tasks.append(task) + + if tasks: + results = await asyncio.gather(*tasks) + for result in results: + if result: + crawled_urls.append(result.url) + extracted_data[result.url] = result return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py index 9ffdac52..735d1d58 100644 --- a/crawl4ai/scraper/models.py +++ b/crawl4ai/scraper/models.py @@ -1,7 +1,8 @@ from pydantic import BaseModel from typing import List, Dict +from ..models import CrawlResult class ScraperResult(BaseModel): url: str crawled_urls: List[str] - extracted_data: Dict \ No newline at end of file + extracted_data: Dict[str,CrawlResult] \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index 16df9ece..6d1cdc74 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ 
b/crawl4ai/scraper/scraper_strategy.py @@ -5,5 +5,5 @@ from ..async_webcrawler import AsyncWebCrawler class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, url: str, crawler: AsyncWebCrawler) -> ScraperResult: pass \ No newline at end of file From 7fe220dbd587bc72a0ccaf27ad74af48ca747d6e Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Oct 2024 11:17:11 +0530 Subject: [PATCH 03/20] 1. Introduced a bool flag to the ascrape method to switch between sequential and concurrent processing 2. Introduced a dictionary for depth tracking across tasks 3. Removed the redundant crawled_urls variable; the returned object now builds crawled_urls from the visited set. --- crawl4ai/scraper/async_web_scraper.py | 4 ++-- crawl4ai/scraper/bfs_scraper_strategy.py | 24 +++++++++++++++--------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 6cf5488c..fadfa61f 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -23,8 +23,8 @@ class AsyncWebScraper: self.strategy = strategy self.batch_processor = BatchProcessor(batch_size, concurrency_limit) - async def ascrape(self, url: str) -> ScraperResult: - return await self.strategy.ascrape(url, self.crawler) + async def ascrape(self, url: str, parallel_processing: bool = True) -> ScraperResult: + return await self.strategy.ascrape(url, self.crawler, parallel_processing) async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: all_results = [] diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index a8fb1fe1..a3bb6750 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -13,6 +13,7 @@ from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict import logging +from typing import Dict logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request per second @@ -44,7 +45,7 @@ class BFSScraperStrategy(ScraperStrategy): async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: return await crawler.arun(url) - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set) -> CrawlResult: + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> CrawlResult: def normalize_url(url: str) -> str: parsed = urlparse(url) return urlunparse(parsed._replace(fragment="")) @@ -98,34 +99,39 @@ class BFSScraperStrategy(ScraperStrategy): absolute_link = urljoin(url, link['href']) normalized_link = normalize_url(absolute_link) if self.filter_chain.apply(normalized_link) and normalized_link not in visited: - new_depth = depth + 1 + new_depth = depths[url] + 1 if new_depth <= self.max_depth: # URL Scoring score = self.url_scorer.score(normalized_link) await queue.put((score, new_depth, normalized_link)) + depths[normalized_link] = new_depth return crawl_result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> ScraperResult: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited =
set() - crawled_urls = [] extracted_data = {} + depths = {start_url: 0} while not queue.empty(): tasks = [] while not queue.empty() and len(tasks) < self.max_concurrent: _, depth, url = await queue.get() if url not in visited: - task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited)) - tasks.append(task) + if parallel_processing: + task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) + tasks.append(task) + else: + result = await self.process_url(url, depth, crawler, queue, visited, depths) + if result: + extracted_data[result.url] = result - if tasks: + if parallel_processing and tasks: results = await asyncio.gather(*tasks) for result in results: if result: - crawled_urls.append(result.url) extracted_data[result.url] = result - return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file + return ScraperResult(url=start_url, crawled_urls=list(visited), extracted_data=extracted_data) \ No newline at end of file From d743adac68fc4c606283428de3451634c1a5e04f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 3 Oct 2024 15:58:57 +0530 Subject: [PATCH 04/20] Fixed some bugs in robots.txt processing --- crawl4ai/scraper/bfs_scraper_strategy.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index a3bb6750..dc89047a 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -32,13 +32,17 @@ class BFSScraperStrategy(ScraperStrategy): # Robots.txt Parser def get_robot_parser(self, url: str) -> RobotFileParser: - domain = urlparse(url).netloc - if domain not in self.robot_parsers: + domain = urlparse(url) + scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided + netloc = domain.netloc + + if netloc not in self.robot_parsers: rp = RobotFileParser() - rp.set_url(f"https://{domain}/robots.txt") + rp.set_url(f"{scheme}://{netloc}/robots.txt") rp.read() - self.robot_parsers[domain] = rp - return self.robot_parsers[domain] + self.robot_parsers[netloc] = rp + return self.robot_parsers[netloc] + # Retry with exponential backoff @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10)) @@ -56,7 +60,7 @@ class BFSScraperStrategy(ScraperStrategy): return None # Robots.txt Compliance - if not self.get_robot_parser(url).can_fetch("YourUserAgent", url): + if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url): logging.info(f"Skipping {url} as per robots.txt") return None From 768b93140f23297fecca99341ebbe9c6798032fc Mon Sep 17 00:00:00 2001 From: hitesh22rana Date: Sat, 5 Oct 2024 00:25:41 +0900 Subject: [PATCH 05/20] docs: fixed css_selector for example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 94dd7a88..9fb7776c 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ async def main(): result = await crawler.arun( url="https://www.nbcnews.com/business", js_code=js_code, - css_selector="article.tease-card", + css_selector=".wide-tease-item__description", bypass_cache=True ) print(result.extracted_content) From ff3524d9b1f76bb06a43a7721eb958db9bd01463 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:42 +0800 Subject: [PATCH 06/20] feat(v0.3.6): Add screenshot capture, delayed content, and custom timeouts - Implement screenshot capture 
functionality - Add delayed content retrieval method - Introduce custom page timeout parameter - Enhance LLM support with multiple providers - Improve database schema auto-updates - Optimize image processing in WebScrappingStrategy - Update error handling and logging - Expand examples in quickstart_async.py --- .gitignore | 4 ++- CHANGELOG.md | 33 +++++++++++++++++++ crawl4ai/__init__.py | 2 +- crawl4ai/async_crawler_strategy.py | 34 ++++++++++++++++--- crawl4ai/async_database.py | 21 ++++++++++-- crawl4ai/async_webcrawler.py | 4 +-- crawl4ai/content_scrapping_strategy.py | 6 ++-- docs/examples/quickstart_async.py | 45 ++++++++++++++++++++------ 8 files changed, 127 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index 85882f69..8b8f014c 100644 --- a/.gitignore +++ b/.gitignore @@ -201,4 +201,6 @@ test_env/ todo.md git_changes.py git_changes.md -pypi_build.sh \ No newline at end of file +pypi_build.sh + +.tests/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b564ed..701d6903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,38 @@ # Changelog +## [0.3.6] - 2024-10-12 + +### Added +- New `.tests/` directory added to `.gitignore` +- Screenshot functionality: + - Added `screenshot` column to the database schema + - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` + - Added option to capture screenshots when crawling +- Delayed content retrieval: + - New `get_delayed_content` method in `AsyncCrawlResponse` +- Database schema updates: + - Auto-update mechanism for database schema + - New columns: 'media', 'links', 'metadata', 'screenshot' +- LLM extraction examples in `quickstart_async.py`: + - Support for OpenAI, Hugging Face, and Ollama models + +### Changed +- Updated version number to 0.3.6 in `__init__.py` +- Improved error handling and logging in various components +- Enhanced `WebScrappingStrategy` to handle image processing more efficiently +- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values + +### Fixed +- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags + +### Removed +- Removed `pypi_build.sh` from version control (added to `.gitignore`) + +### Developer Notes +- Added examples for using different LLM providers in `quickstart_async.py` +- Improved error messages for better debugging +- Enhanced type hinting throughout the codebase + ## [v0.3.5] - 2024-09-02 Enhance AsyncWebCrawler with smart waiting and screenshot capabilities diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 186730e8..04da30f8 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -3,7 +3,7 @@ from .async_webcrawler import AsyncWebCrawler from .models import CrawlResult -__version__ = "0.3.5" +__version__ = "0.3.6" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 987925f8..28795a3e 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1,7 +1,7 @@ import asyncio import base64, time from abc import ABC, abstractmethod -from typing import Callable, Dict, Any, List, Optional +from typing import Callable, Dict, Any, List, Optional, Awaitable import os from playwright.async_api import async_playwright, Page, Browser, Error from io import BytesIO @@ -18,6 +18,10 @@ class AsyncCrawlResponse(BaseModel): response_headers: Dict[str, str] status_code: int screenshot: Optional[str] = None + get_delayed_content: 
Optional[Callable[[Optional[float]], Awaitable[str]]] = None + + class Config: + arbitrary_types_allowed = True class AsyncCrawlerStrategy(ABC): @abstractmethod @@ -248,7 +252,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if not kwargs.get("js_only", False): await self.execute_hook('before_goto', page) - response = await page.goto(url, wait_until="domcontentloaded", timeout=60000) + response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)) await self.execute_hook('after_goto', page) # Get status code and headers @@ -295,6 +299,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") + # Check if kwargs has screenshot=True then take screenshot + screenshot_data = None + if kwargs.get("screenshot"): + screenshot_data = await self.take_screenshot(url) + html = await page.content() page = await self.execute_hook('before_return_html', page, html) @@ -312,7 +321,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "status_code": status_code }, f) - response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code) + + async def get_delayed_content(delay: float = 5.0) -> str: + if self.verbose: + print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}") + await asyncio.sleep(delay) + return await page.content() + + response = AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=get_delayed_content + ) return response except Error as e: raise Error(f"Failed to crawl {url}: {str(e)}") @@ -383,11 +405,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): results = await asyncio.gather(*tasks, return_exceptions=True) return [result if not isinstance(result, Exception) else str(result) for result in results] - async def take_screenshot(self, url: str) -> str: + async def take_screenshot(self, url: str, wait_time = 1000) -> str: async with await self.browser.new_context(user_agent=self.user_agent) as context: page = await context.new_page() try: - await page.goto(url, wait_until="domcontentloaded") + await page.goto(url, wait_until="domcontentloaded", timeout=30000) + # Wait for a specified time (default is 1 second) + await page.wait_for_timeout(wait_time) screenshot = await page.screenshot(full_page=True) return base64.b64encode(screenshot).decode('utf-8') except Exception as e: diff --git a/crawl4ai/async_database.py b/crawl4ai/async_database.py index baa53255..61d98e9c 100644 --- a/crawl4ai/async_database.py +++ b/crawl4ai/async_database.py @@ -29,14 +29,31 @@ class AsyncDatabaseManager: ) ''') await db.commit() + await self.update_db_schema() - async def aalter_db_add_screenshot(self, new_column: str = "media"): + async def update_db_schema(self): + async with aiosqlite.connect(self.db_path) as db: + # Check if the 'media' column exists + cursor = await db.execute("PRAGMA table_info(crawled_data)") + columns = await cursor.fetchall() + column_names = [column[1] for column in columns] + + if 'media' not in column_names: + await self.aalter_db_add_column('media') + + # Check for other missing columns and add them if necessary + for column in ['links', 'metadata', 'screenshot']: + if column not in column_names: + await self.aalter_db_add_column(column) + + async def aalter_db_add_column(self, new_column: str): try: async with aiosqlite.connect(self.db_path) as db: await 
db.execute(f'ALTER TABLE crawled_data ADD COLUMN {new_column} TEXT DEFAULT ""') await db.commit() + print(f"Added column '{new_column}' to the database.") except Exception as e: - print(f"Error altering database to add screenshot column: {e}") + print(f"Error altering database to add {new_column} column: {e}") async def aget_cached_url(self, url: str) -> Optional[Tuple[str, str, str, str, str, str, str, bool, str]]: try: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 88c05f03..d308e930 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -202,11 +202,11 @@ class AsyncWebCrawler: ) if result is None: - raise ValueError(f"Failed to extract content from the website: {url}") + raise ValueError(f"Process HTML: failed to extract content from the website: {url}") except InvalidCSSSelectorError as e: raise ValueError(str(e)) except Exception as e: - raise ValueError(f"Failed to extract content from the website: {url}, error: {str(e)}") + raise ValueError(f"Process HTML: failed to extract content from the website: {url}, error: {str(e)}") cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) markdown = sanitize_input_encode(result.get("markdown", "")) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index e3d2c57f..afd75892 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -170,10 +170,12 @@ class WebScrappingStrategy(ContentScrappingStrategy): if isinstance(element, Comment): element.extract() return False + + # if element.name == 'img': + # process_image(element, url, 0, 1) + # return True if element.name in ['script', 'style', 'link', 'meta', 'noscript']: - if element.name == 'img': - process_image(element, url, 0, 1) element.decompose() return False diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 1a2d9570..836bdb1d 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -66,6 +66,29 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters + +async def capture_and_save_screenshot(url: str, output_path: str): + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url=url, + screenshot=True, + bypass_cache=True + ) + + if result.success and result.screenshot: + import base64 + + # Decode the base64 screenshot data + screenshot_data = base64.b64decode(result.screenshot) + + # Save the decoded screenshot bytes to the output file + with open(output_path, 'wb') as f: + f.write(screenshot_data) + + print(f"Screenshot saved successfully to {output_path}") + else: + print("Failed to capture screenshot") + class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") output_fee: str = Field( ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(): - print("\n--- Extracting Structured Data with OpenAI ---") - print( - "Note: Set your OpenAI API key as an environment variable to run this example." - ) - if not os.getenv("OPENAI_API_KEY"): - print("OpenAI API key not found.
Skipping this example.") +async def extract_structured_data_using_llm(provider: str, api_token: str = None): + print(f"\n--- Extracting Structured Data with {provider} ---") + + if api_token is None and provider != "ollama": + print(f"API token is required for {provider}. Skipping this example.") return async with AsyncWebCrawler(verbose=True) as crawler: @@ -87,8 +108,8 @@ async def extract_structured_data_using_llm(): url="https://openai.com/api/pricing/", word_count_threshold=1, extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv("OPENAI_API_KEY"), + provider=provider, + api_token=api_token, schema=OpenAIModelFee.schema(), extraction_type="schema", instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. @@ -390,7 +411,13 @@ async def main(): await js_and_css() await use_proxy() await extract_structured_data_using_css_extractor() + + # LLM extraction examples await extract_structured_data_using_llm() + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("ollama/llama3.2") + # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() From 9b2b267820c79fd9c45094e0d9fece57c82aa533 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 13:42:56 +0800 Subject: [PATCH 07/20] CHANGELOG UPDATE --- CHANGELOG.md | 68 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 701d6903..873af87f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,37 +1,51 @@ # Changelog -## [0.3.6] - 2024-10-12 +## [v0.3.6] - 2024-10-12 -### Added -- New `.tests/` directory added to `.gitignore` -- Screenshot functionality: - - Added `screenshot` column to the database schema - - Implemented `take_screenshot` method in `AsyncPlaywrightCrawlerStrategy` - - Added option to capture screenshots when crawling -- Delayed content retrieval: - - New `get_delayed_content` method in `AsyncCrawlResponse` -- Database schema updates: - - Auto-update mechanism for database schema - - New columns: 'media', 'links', 'metadata', 'screenshot' -- LLM extraction examples in `quickstart_async.py`: - - Support for OpenAI, Hugging Face, and Ollama models +### 1. Screenshot Capture +- **What's new**: Added ability to capture screenshots during crawling. +- **Why it matters**: You can now visually verify the content of crawled pages, which is useful for debugging and content verification. +- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. -### Changed -- Updated version number to 0.3.6 in `__init__.py` -- Improved error handling and logging in various components -- Enhanced `WebScrappingStrategy` to handle image processing more efficiently -- Modified `AsyncPlaywrightCrawlerStrategy` to support custom timeout values +### 2. Delayed Content Retrieval +- **What's new**: Introduced `get_delayed_content` method in `AsyncCrawlResponse`. +- **Why it matters**: Allows you to retrieve content after a specified delay, useful for pages that load content dynamically. +- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. 
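A minimal sketch of the retrieval flow just described, following the `result.get_delayed_content` access path stated in this changelog entry; the URL, the 5-second delay, and the guarded attribute lookup are illustrative assumptions, not part of the patch:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def delayed_content_demo():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url="https://example.com", bypass_cache=True)
        # get_delayed_content is populated by AsyncPlaywrightCrawlerStrategy;
        # it sleeps for the given delay and then re-reads page.content().
        delayed = getattr(result, "get_delayed_content", None)
        if delayed is not None:
            html_after_wait = await delayed(5.0)
            print(f"Content length after 5s delay: {len(html_after_wait)}")

asyncio.run(delayed_content_demo())
```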
-### Fixed -- Adjusted image processing in `WebScrappingStrategy` to prevent premature decomposition of img tags +### 3. Custom Page Timeout +- **What's new**: Added `page_timeout` parameter to control page load timeout. +- **Why it matters**: Gives you more control over crawling behavior, especially for slow-loading pages. +- **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`. -### Removed -- Removed `pypi_build.sh` from version control (added to `.gitignore`) +### 4. Enhanced LLM Support +- **What's new**: Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). +- **Why it matters**: Provides more flexibility in choosing AI models for content extraction. +- **How to use**: Specify the desired provider when using `LLMExtractionStrategy`. -### Developer Notes -- Added examples for using different LLM providers in `quickstart_async.py` -- Improved error messages for better debugging -- Enhanced type hinting throughout the codebase +## Improvements + +### 1. Database Schema Auto-updates +- **What's new**: Automatic database schema updates. +- **Why it matters**: Ensures your database stays compatible with the latest version without manual intervention. + +### 2. Enhanced Error Handling +- **What's new**: Improved error messages and logging. +- **Why it matters**: Makes debugging easier with more informative error messages. + +### 3. Optimized Image Processing +- **What's new**: Refined image handling in `WebScrappingStrategy`. +- **Why it matters**: Improves the accuracy of content extraction for pages with images. + +## Bug Fixes + +- Fixed an issue where image tags were being prematurely removed during content extraction. + +## Developer Notes + +- Added examples for using different LLM providers in `quickstart_async.py`. +- Enhanced type hinting throughout the codebase for better development experience. + +We're constantly working to improve crawl4ai. These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements! ## [v0.3.5] - 2024-09-02 From 68e9144ce3c8821849358b48f57e74d7504bb32b Mon Sep 17 00:00:00 2001 From: unclecode Date: Sat, 12 Oct 2024 14:48:22 +0800 Subject: [PATCH 08/20] feat: Enhance crawling control and LLM extraction flexibility - Add before_retrieve_html hook and delay_before_return_html option - Implement flexible page_timeout for smart_wait function - Support extra_args and custom headers in LLM extraction - Allow arbitrary kwargs in AsyncWebCrawler initialization - Improve perform_completion_with_backoff for custom API calls - Update examples with new features and diverse LLM providers --- CHANGELOG.md | 33 ++++++++++++++++++++++++++++-- crawl4ai/async_crawler_strategy.py | 12 ++++++++--- crawl4ai/async_webcrawler.py | 6 +++--- crawl4ai/extraction_strategy.py | 9 +++++++- crawl4ai/utils.py | 12 ++++++++++- docs/examples/quickstart_async.py | 16 +++++++++++++-- 6 files changed, 76 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 873af87f..197fa32b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,36 @@ # Changelog -## [v0.3.6] - 2024-10-12 +## [v0.3.6] - 2024-10-12 - Part 1 + +### 1. Improved Crawling Control +- **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. +- **Delayed HTML Retrieval**: Introduced `delay_before_return_html` parameter to allow waiting before retrieving HTML content. 
+ - Useful for pages with delayed content loading. +- **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. + - Provides better handling for slow-loading pages. + +### 2. Enhanced LLM Extraction Strategy +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. + - Enables more flexibility when interacting with different LLM APIs. + +### 3. AsyncWebCrawler Improvements +- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments. + - These are passed directly to the crawler strategy, allowing for more customized setups. + +### 4. Utility Function Enhancements +- **Improved API Interaction**: `perform_completion_with_backoff` function now supports additional arguments. + - Allows for more customized API calls to LLM providers. + +## Examples and Documentation +- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction. +- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama. + +## Developer Notes +- Refactored code for better maintainability and flexibility. +- Enhanced error handling and logging for improved debugging experience. + +## [v0.3.6] - 2024-10-12 - Part 2 ### 1. Screenshot Capture - **What's new**: Added ability to capture screenshots during crawling. @@ -45,7 +75,6 @@ - Added examples for using different LLM providers in `quickstart_async.py`. - Enhanced type hinting throughout the codebase for better development experience. -We're constantly working to improve crawl4ai. These updates aim to provide you with more control, flexibility, and reliability in your web crawling tasks. As always, we appreciate your feedback and suggestions for future improvements! 
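A short sketch of the custom-headers flow these notes describe, assuming the `LLMExtractionStrategy` parameters shown in this patch series; the provider choice, instruction text, and header value are placeholders:

```python
import asyncio
import os
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

async def extract_with_custom_headers():
    # extra_args is forwarded to perform_completion_with_backoff, which merges
    # it into the litellm completion call; extra_headers mirrors the quickstart.
    strategy = LLMExtractionStrategy(
        provider="openai/gpt-4",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="Summarize the page in one short paragraph.",
        extra_args={"extra_headers": {"X-Custom-Header": "Some-Value"}},
    )
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            extraction_strategy=strategy,
            bypass_cache=True,
        )
        print(result.extracted_content)

asyncio.run(extract_with_custom_headers())
```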
## [v0.3.5] - 2024-09-02 diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 28795a3e..c74aff13 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -63,7 +63,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): 'on_execution_started': None, 'before_goto': None, 'after_goto': None, - 'before_return_html': None + 'before_return_html': None, + 'before_retrieve_html': None } async def __aenter__(self): @@ -295,7 +296,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): wait_for = kwargs.get("wait_for") if wait_for: try: - await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000)) + await self.smart_wait(page, wait_for, timeout=kwargs.get("page_timeout", 60000)) except Exception as e: raise RuntimeError(f"Wait condition failed: {str(e)}") @@ -304,8 +305,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("screenshot"): screenshot_data = await self.take_screenshot(url) + await self.execute_hook('before_retrieve_html', page) + # Check if delay_before_return_html is set then wait for that time + delay_before_return_html = kwargs.get("delay_before_return_html") + if delay_before_return_html: + await asyncio.sleep(delay_before_return_html) html = await page.content() - page = await self.execute_hook('before_return_html', page, html) + await self.execute_hook('before_return_html', page, html) if self.verbose: print(f"[LOG] ✅ Crawled {url} successfully!") diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d308e930..ba82d28f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -23,17 +23,17 @@ class AsyncWebCrawler: self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, always_by_pass_cache: bool = False, - verbose: bool = False, + **kwargs, ): self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( - verbose=verbose + **kwargs ) self.always_by_pass_cache = always_by_pass_cache self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") os.makedirs(self.crawl4ai_folder, exist_ok=True) os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) self.ready = False - self.verbose = verbose + self.verbose = kwargs.get("verbose", False) async def __aenter__(self): await self.crawler_strategy.__aenter__() diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 48491067..210a360b 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -80,6 +80,7 @@ class LLMExtractionStrategy(ExtractionStrategy): self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) self.apply_chunking = kwargs.get("apply_chunking", True) self.base_url = kwargs.get("base_url", None) + self.extra_args = kwargs.get("extra_args", {}) if not self.apply_chunking: self.chunk_token_threshold = 1e9 @@ -111,7 +112,13 @@ class LLMExtractionStrategy(ExtractionStrategy): "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema") + response = perform_completion_with_backoff( + self.provider, + prompt_with_variables, + self.api_token, + base_url=self.base_url, + extra_args = self.extra_args + ) # , json_response=self.extract_type == "schema") try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) diff --git a/crawl4ai/utils.py 
b/crawl4ai/utils.py index 71a36aed..77671a20 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -775,7 +775,14 @@ def extract_xml_data(tags, string): return data # Function to perform the completion with exponential backoff -def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None): +def perform_completion_with_backoff( + provider, + prompt_with_variables, + api_token, + json_response = False, + base_url=None, + **kwargs + ): from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -784,6 +791,9 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token, extra_args = {} if json_response: extra_args["response_format"] = { "type": "json_object" } + + if kwargs.get("extra_args"): + extra_args.update(kwargs["extra_args"]) for attempt in range(max_attempts): try: diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 836bdb1d..9f00e323 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -96,13 +96,17 @@ class OpenAIModelFee(BaseModel): ..., description="Fee for output token for the OpenAI model." ) -async def extract_structured_data_using_llm(provider: str, api_token: str = None): +async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None): print(f"\n--- Extracting Structured Data with {provider} ---") if api_token is None and provider != "ollama": print(f"API token is required for {provider}. Skipping this example.") return + extra_args = {} + if extra_headers: + extra_args["extra_headers"] = extra_headers + async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url="https://openai.com/api/pricing/", @@ -115,6 +119,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens. Do not miss any models in the entire content. 
One extracted model JSON format should look like this: {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""", + extra_args=extra_args ), bypass_cache=True, ) @@ -414,9 +419,16 @@ async def main(): # LLM extraction examples await extract_structured_data_using_llm() - await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("ollama/llama3.2") + + # You can always pass custom headers to the extraction strategy + custom_headers = { + "Authorization": "Bearer your-custom-token", + "X-Custom-Header": "Some-Value" + } + await extract_structured_data_using_llm("openai/gpt-4", os.getenv("OPENAI_API_KEY"), extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() await crawl_dynamic_content_pages_method_3() From b9bbd4237355afb3fcd6b8ebc407d8b61b84a21c Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 13 Oct 2024 14:37:45 +0800 Subject: [PATCH 09/20] Update Quickstart examples --- docs/examples/quickstart_async.py | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f00e323..27a162e3 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -10,6 +10,7 @@ import time import json import os import re +from typing import Dict from bs4 import BeautifulSoup from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler @@ -18,6 +19,8 @@ from crawl4ai.extraction_strategy import ( LLMExtractionStrategy, ) +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + print("Crawl4AI: Advanced Web Crawling and Data Extraction") print("GitHub Repository: https://github.com/unclecode/crawl4ai") print("Twitter: @unclecode") @@ -30,7 +33,7 @@ async def simple_crawl(): result = await crawler.arun(url="https://www.nbcnews.com/business") print(result.markdown[:500]) # Print first 500 characters -async def js_and_css(): +async def simple_example_with_running_js_code(): print("\n--- Executing JavaScript and Using CSS Selectors ---") # New code to handle the wait_for parameter wait_for = """() => { @@ -47,12 +50,21 @@ async def simple_example_with_running_js_code(): result = await crawler.arun( url="https://www.nbcnews.com/business", js_code=js_code, - # css_selector="article.tease-card", # wait_for=wait_for, bypass_cache=True, ) print(result.markdown[:500]) # Print first 500 characters +async def simple_example_with_css_selector(): + print("\n--- Using CSS Selectors ---") + async with AsyncWebCrawler(verbose=True) as crawler: + result = await crawler.arun( + url="https://www.nbcnews.com/business", + css_selector=".wide-tease-item__description", + bypass_cache=True, + ) + print(result.markdown[:500]) # Print first 500 characters + async def use_proxy(): print("\n--- Using a Proxy ---") print( @@ -66,7 +78,6 @@ async def use_proxy(): # ) # print(result.markdown[:500]) # Print first 500 characters - async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( @@ -413,8 +424,10 @@ async def main(): await simple_crawl() - await js_and_css() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) await extract_structured_data_using_css_extractor() # LLM extraction examples From 320afdea64f92c9a5942e901f4a9016ea7ab13f1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:03:28 +0800 Subject: [PATCH 10/20] feat: Enhance crawler flexibility and LLM extraction capabilities - Add browser type selection (Chromium, Firefox, WebKit) - Implement iframe content extraction - Improve image processing and dimension updates - Add custom headers support in AsyncPlaywrightCrawlerStrategy - Enhance delayed content retrieval with new parameter - Optimize HTML sanitization and Markdown conversion - Update examples in quickstart_async.py for new features --- .gitignore | 3 +- crawl4ai/async_crawler_strategy.py | 125 ++++++++++++++++++- crawl4ai/content_scrapping_strategy.py | 13 +- crawl4ai/prompts.py | 4 +- crawl4ai/utils.py | 160 ++++++++++++------------- crawl4ai/web_crawler.py | 1 + docs/examples/quickstart_async.py | 25 ++++ 7 files changed, 238 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 8b8f014c..e5718a14 100644 --- a/.gitignore +++ b/.gitignore @@ -203,4 +203,5 @@ git_changes.py git_changes.md pypi_build.sh -.tests/ \ No newline at end of file +.tests/ +git_changes.py \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index c74aff13..e9699953 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -50,7 +50,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") self.proxy = kwargs.get("proxy") self.headless = kwargs.get("headless", True) - self.headers = {} + self.browser_type = kwargs.get("browser_type", "chromium") # New parameter + self.headers = kwargs.get("headers", {}) self.sessions = {} self.session_ttl = 1800 self.js_code = js_code @@ -80,7 +81,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.browser is None: browser_args = { "headless": self.headless, - # "headless": False, "args": [ "--disable-gpu", "--disable-dev-shm-usage", @@ -95,7 +95,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): browser_args["proxy"] = proxy_settings - self.browser = await self.playwright.chromium.launch(**browser_args) + # Select the appropriate browser based on the browser_type + if self.browser_type == "firefox": + self.browser = await self.playwright.firefox.launch(**browser_args) + elif self.browser_type == "webkit": + self.browser = await self.playwright.webkit.launch(**browser_args) + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + await self.execute_hook('on_browser_created', self.browser) async def close(self): @@ -145,7 +152,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for sid in expired_sessions: asyncio.create_task(self.kill_session(sid)) - async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): wait_for = wait_for.strip() @@ -209,6 +215,48 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Exception as e: raise RuntimeError(f"Error in wait condition: {str(e)}") + async def process_iframes(self, page): + # Find all iframes + iframes = await page.query_selector_all('iframe') + + for i, iframe in enumerate(iframes): + try: + # Add a unique 
identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state('load', timeout=30000) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate('() => document.body.innerHTML') + + # Generate a unique class name for this iframe + class_name = f'extracted-iframe-content-{i}' + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace('`', '\\`') + await page.evaluate(f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """) + else: + print(f"Warning: Could not access content frame for iframe {i}") + except Exception as e: + print(f"Error processing iframe {i}: {str(e)}") + + # Return the page object + return page + + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: response_headers = {} status_code = None @@ -263,6 +311,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): status_code = 200 response_headers = {} + await page.wait_for_selector('body') await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") @@ -305,11 +354,78 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if kwargs.get("screenshot"): screenshot_data = await self.take_screenshot(url) + + # New code to update image dimensions + update_image_dimensions_js = """ + () => { + return new Promise((resolve) => { + const filterImage = (img) => { + // Filter out images that are too small + if (img.width < 100 && img.height < 100) return false; + + // Filter out images that are not visible + const rect = img.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return false; + + // Filter out images with certain class names (e.g., icons, thumbnails) + if (img.classList.contains('icon') || img.classList.contains('thumbnail')) return false; + + // Filter out images with certain patterns in their src (e.g., placeholder images) + if (img.src.includes('placeholder') || img.src.includes('icon')) return false; + + return true; + }; + + const images = Array.from(document.querySelectorAll('img')).filter(filterImage); + let imagesLeft = images.length; + + if (imagesLeft === 0) { + resolve(); + return; + } + + const checkImage = (img) => { + if (img.complete && img.naturalWidth !== 0) { + img.setAttribute('width', img.naturalWidth); + img.setAttribute('height', img.naturalHeight); + imagesLeft--; + if (imagesLeft === 0) resolve(); + } + }; + + images.forEach(img => { + checkImage(img); + if (!img.complete) { + img.onload = () => { + checkImage(img); + }; + img.onerror = () => { + imagesLeft--; + if (imagesLeft === 0) resolve(); + }; + } + }); + + // Fallback timeout of 5 seconds + setTimeout(() => resolve(), 5000); + }); + } + """ + await page.evaluate(update_image_dimensions_js) + + # Wait a bit for any onload events to complete + await page.wait_for_timeout(100) + + # Process iframes + if kwargs.get("process_iframes", False): + page = await self.process_iframes(page) + await self.execute_hook('before_retrieve_html', page) # Check if delay_before_return_html is set then wait for that time delay_before_return_html = kwargs.get("delay_before_return_html") if delay_before_return_html: await asyncio.sleep(delay_before_return_html) + 
html = await page.content() await self.execute_hook('before_return_html', page, html) @@ -398,7 +514,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): except Error as e: raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}") - async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]: semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count()) semaphore = asyncio.Semaphore(semaphore_count) diff --git a/crawl4ai/content_scrapping_strategy.py b/crawl4ai/content_scrapping_strategy.py index afd75892..68f03412 100644 --- a/crawl4ai/content_scrapping_strategy.py +++ b/crawl4ai/content_scrapping_strategy.py @@ -16,8 +16,6 @@ from .utils import ( CustomHTML2Text ) - - class ContentScrappingStrategy(ABC): @abstractmethod def scrap(self, url: str, html: str, **kwargs) -> Dict[str, Any]: @@ -129,7 +127,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) image_format = os.path.splitext(img.get('src',''))[1].lower() # Remove . from format - image_format = image_format.strip('.') + image_format = image_format.strip('.').split('?')[0] score = 0 if height_value: if height_unit == 'px' and height_value > 150: @@ -158,6 +156,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): return None return { 'src': img.get('src', ''), + 'data-src': img.get('data-src', ''), 'alt': img.get('alt', ''), 'desc': find_closest_parent_with_useful_text(img), 'score': score, @@ -275,11 +274,14 @@ class WebScrappingStrategy(ContentScrappingStrategy): # Replace base64 data with empty string img['src'] = base64_pattern.sub('', src) cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') - cleaned_html = sanitize_html(cleaned_html) h = CustomHTML2Text() h.ignore_links = True - markdown = h.handle(cleaned_html) + h.body_width = 0 + try: + markdown = h.handle(cleaned_html) + except Exception as e: + markdown = h.handle(sanitize_html(cleaned_html)) markdown = markdown.replace(' ```', '```') try: @@ -288,6 +290,7 @@ class WebScrappingStrategy(ContentScrappingStrategy): print('Error extracting metadata:', str(e)) meta = {} + cleaned_html = sanitize_html(cleaned_html) return { 'markdown': markdown, 'cleaned_html': cleaned_html, diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index a55d6fca..7a963e6d 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -1,4 +1,4 @@ -PROMPT_EXTRACT_BLOCKS = """YHere is the URL of the webpage: +PROMPT_EXTRACT_BLOCKS = """Here is the URL of the webpage: {URL} And here is the cleaned HTML content of that webpage: @@ -79,7 +79,7 @@ To generate the JSON objects: 2. For each block: a. Assign it an index based on its order in the content. b. Analyze the content and generate ONE semantic tag that describe what the block is about. - c. Extract the text content, EXACTLY SAME AS GIVE DATA, clean it up if needed, and store it as a list of strings in the "content" field. + c. Extract the text content, EXACTLY THE SAME AS THE GIVEN DATA, clean it up if needed, and store it as a list of strings in the "content" field. 3. Ensure that the order of the JSON objects matches the order of the blocks as they appear in the original HTML content. 
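Taken together, the features this patch introduces (browser selection, iframe extraction, screenshots, and the HTML retrieval delay) are all driven from the `arun` call. A minimal usage sketch, assuming the keyword arguments shown in the diffs above; the URL and delay values are illustrative:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def demo():
    # browser_type is the new kwarg; "webkit" and the default "chromium" also work
    async with AsyncWebCrawler(browser_type="firefox", verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",   # illustrative URL
            process_iframes=True,            # replace iframes with divs holding their extracted content
            screenshot=True,                 # capture a screenshot during the crawl
            delay_before_return_html=0.5,    # wait 0.5s before reading the final HTML
            bypass_cache=True,
        )
        print(result.markdown[:500])

if __name__ == "__main__":
    asyncio.run(demo())
```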
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 77671a20..efb5d79b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -131,7 +131,7 @@ def split_and_parse_json_objects(json_string): return parsed_objects, unparsed_segments def sanitize_html(html): - # Replace all weird and special characters with an empty string + # Replace all unwanted and special characters with an empty string sanitized_html = html # sanitized_html = re.sub(r'[^\w\s.,;:!?=\[\]{}()<>\/\\\-"]', '', html) @@ -301,7 +301,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, if tag.name != 'img': tag.attrs = {} - # Extract all img tgas inti [{src: '', alt: ''}] + # Extract all img tgas int0 [{src: '', alt: ''}] media = { 'images': [], 'videos': [], @@ -339,7 +339,7 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, img.decompose() - # Create a function that replace content of all"pre" tage with its inner text + # Create a function that replace content of all"pre" tag with its inner text def replace_pre_tags_with_text(node): for child in node.find_all('pre'): # set child inner html to its text @@ -502,7 +502,7 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: current_tag = tag while current_tag: current_tag = current_tag.parent - # Get the text content of the parent tag + # Get the text content from the parent tag if current_tag: text_content = current_tag.get_text(separator=' ',strip=True) # Check if the text content has at least word_count_threshold @@ -511,88 +511,88 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: return None def process_image(img, url, index, total_images): - #Check if an image has valid display and inside undesired html elements - def is_valid_image(img, parent, parent_classes): - style = img.get('style', '') - src = img.get('src', '') - classes_to_check = ['button', 'icon', 'logo'] - tags_to_check = ['button', 'input'] - return all([ - 'display:none' not in style, - src, - not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), - parent.name not in tags_to_check - ]) + #Check if an image has valid display and inside undesired html elements + def is_valid_image(img, parent, parent_classes): + style = img.get('style', '') + src = img.get('src', '') + classes_to_check = ['button', 'icon', 'logo'] + tags_to_check = ['button', 'input'] + return all([ + 'display:none' not in style, + src, + not any(s in var for var in [src, img.get('alt', ''), *parent_classes] for s in classes_to_check), + parent.name not in tags_to_check + ]) - #Score an image for it's usefulness - def score_image_for_usefulness(img, base_url, index, images_count): - # Function to parse image height/width value and units - def parse_dimension(dimension): - if dimension: - match = re.match(r"(\d+)(\D*)", dimension) - if match: - number = int(match.group(1)) - unit = match.group(2) or 'px' # Default unit is 'px' if not specified - return number, unit - return None, None + #Score an image for it's usefulness + def score_image_for_usefulness(img, base_url, index, images_count): + # Function to parse image height/width value and units + def parse_dimension(dimension): + if dimension: + match = re.match(r"(\d+)(\D*)", dimension) + if match: + number = int(match.group(1)) + unit = match.group(2) or 'px' # Default unit is 'px' if not specified + return number, unit + return None, None - # Fetch image file metadata to extract size and extension - def 
fetch_image_file_size(img, base_url): - #If src is relative path construct full URL, if not it may be CDN URL - img_url = urljoin(base_url,img.get('src')) - try: - response = requests.head(img_url) - if response.status_code == 200: - return response.headers.get('Content-Length',None) - else: - print(f"Failed to retrieve file size for {img_url}") - return None - except InvalidSchema as e: + # Fetch image file metadata to extract size and extension + def fetch_image_file_size(img, base_url): + #If src is relative path construct full URL, if not it may be CDN URL + img_url = urljoin(base_url,img.get('src')) + try: + response = requests.head(img_url) + if response.status_code == 200: + return response.headers.get('Content-Length',None) + else: + print(f"Failed to retrieve file size for {img_url}") return None - finally: - return + except InvalidSchema as e: + return None + finally: + return - image_height = img.get('height') - height_value, height_unit = parse_dimension(image_height) - image_width = img.get('width') - width_value, width_unit = parse_dimension(image_width) - image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) - image_format = os.path.splitext(img.get('src',''))[1].lower() - # Remove . from format - image_format = image_format.strip('.') - score = 0 - if height_value: - if height_unit == 'px' and height_value > 150: - score += 1 - if height_unit in ['%','vh','vmin','vmax'] and height_value >30: - score += 1 - if width_value: - if width_unit == 'px' and width_value > 150: - score += 1 - if width_unit in ['%','vh','vmin','vmax'] and width_value >30: - score += 1 - if image_size > 10000: + image_height = img.get('height') + height_value, height_unit = parse_dimension(image_height) + image_width = img.get('width') + width_value, width_unit = parse_dimension(image_width) + image_size = 0 #int(fetch_image_file_size(img,base_url) or 0) + image_format = os.path.splitext(img.get('src',''))[1].lower() + # Remove . 
from format + image_format = image_format.strip('.') + score = 0 + if height_value: + if height_unit == 'px' and height_value > 150: score += 1 - if img.get('alt') != '': - score+=1 - if any(image_format==format for format in ['jpg','png','webp']): - score+=1 - if index/images_count<0.5: - score+=1 - return score + if height_unit in ['%','vh','vmin','vmax'] and height_value >30: + score += 1 + if width_value: + if width_unit == 'px' and width_value > 150: + score += 1 + if width_unit in ['%','vh','vmin','vmax'] and width_value >30: + score += 1 + if image_size > 10000: + score += 1 + if img.get('alt') != '': + score+=1 + if any(image_format==format for format in ['jpg','png','webp']): + score+=1 + if index/images_count<0.5: + score+=1 + return score - if not is_valid_image(img, img.parent, img.parent.get('class', [])): - return None - score = score_image_for_usefulness(img, url, index, total_images) - if score <= IMAGE_SCORE_THRESHOLD: - return None - return { - 'src': img.get('src', ''), - 'alt': img.get('alt', ''), - 'desc': find_closest_parent_with_useful_text(img), - 'score': score, - 'type': 'image' - } + if not is_valid_image(img, img.parent, img.parent.get('class', [])): + return None + score = score_image_for_usefulness(img, url, index, total_images) + if score <= IMAGE_SCORE_THRESHOLD: + return None + return { + 'src': img.get('src', '').replace('\\"', '"').strip(), + 'alt': img.get('alt', ''), + 'desc': find_closest_parent_with_useful_text(img), + 'score': score, + 'type': 'image' + } def process_element(element: element.PageElement) -> bool: try: diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 7dea56ca..20e9b04e 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -12,6 +12,7 @@ from typing import List from concurrent.futures import ThreadPoolExecutor from .config import * import warnings +import json warnings.filterwarnings("ignore", message='Field "model_name" has conflict with protected namespace "model_".') diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 27a162e3..f6c16a4e 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -357,6 +357,28 @@ async def crawl_dynamic_content_pages_method_3(): await crawler.crawler_strategy.kill_session(session_id) print(f"Successfully crawled {len(all_commits)} commits across 3 pages") +async def crawl_custom_browser_type(): + # Use Firefox + start = time.time() + async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use WebKit + start = time.time() + async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + + # Use Chromium (default) + start = time.time() + async with AsyncWebCrawler(verbose=True, headless = True) as crawler: + result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + print(result.markdown[:500]) + print("Time taken: ", time.time() - start) + async def speed_comparison(): # print("\n--- Speed Comparison ---") # print("Firecrawl (simulated):") @@ -446,6 +468,9 @@ async def main(): # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() await 
crawl_dynamic_content_pages_method_3() + + await crawl_custom_browser_type() + await speed_comparison() From 6aa803d712a44c2144bb61a515b09e2815d1ac6d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:03:40 +0800 Subject: [PATCH 11/20] Update gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e5718a14..c6ac6610 100644 --- a/.gitignore +++ b/.gitignore @@ -204,4 +204,5 @@ git_changes.md pypi_build.sh .tests/ -git_changes.py \ No newline at end of file +git_changes.py +git_changes.md \ No newline at end of file From 2b73bdf6b09585fc52bf20b4e88f9eae8159135d Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 21:04:02 +0800 Subject: [PATCH 12/20] Update changelog --- CHANGELOG.md | 114 ++++++++++++++++++++++++++------------------------- 1 file changed, 58 insertions(+), 56 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 197fa32b..a377d794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## [v0.3.6] - 2024-10-12 - Part 1 +## [v0.3.6] - 2024-10-12 ### 1. Improved Crawling Control - **New Hook**: Added `before_retrieve_html` hook in `AsyncPlaywrightCrawlerStrategy`. @@ -8,73 +8,75 @@ - Useful for pages with delayed content loading. - **Flexible Timeout**: `smart_wait` function now uses `page_timeout` (default 60 seconds) instead of a fixed 30-second timeout. - Provides better handling for slow-loading pages. - -### 2. Enhanced LLM Extraction Strategy -- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. -- **Custom Headers**: Users can now pass custom headers to the extraction strategy. - - Enables more flexibility when interacting with different LLM APIs. - -### 3. AsyncWebCrawler Improvements -- **Flexible Initialization**: `AsyncWebCrawler` now accepts arbitrary keyword arguments. - - These are passed directly to the crawler strategy, allowing for more customized setups. - -### 4. Utility Function Enhancements -- **Improved API Interaction**: `perform_completion_with_backoff` function now supports additional arguments. - - Allows for more customized API calls to LLM providers. - -## Examples and Documentation -- Updated `quickstart_async.py` with examples of using custom headers in LLM extraction. -- Added more diverse examples of LLM provider usage, including OpenAI, Hugging Face, and Ollama. - -## Developer Notes -- Refactored code for better maintainability and flexibility. -- Enhanced error handling and logging for improved debugging experience. - -## [v0.3.6] - 2024-10-12 - Part 2 - -### 1. Screenshot Capture -- **What's new**: Added ability to capture screenshots during crawling. -- **Why it matters**: You can now visually verify the content of crawled pages, which is useful for debugging and content verification. -- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. - -### 2. Delayed Content Retrieval -- **What's new**: Introduced `get_delayed_content` method in `AsyncCrawlResponse`. -- **Why it matters**: Allows you to retrieve content after a specified delay, useful for pages that load content dynamically. -- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. - -### 3. Custom Page Timeout -- **What's new**: Added `page_timeout` parameter to control page load timeout. -- **Why it matters**: Gives you more control over crawling behavior, especially for slow-loading pages. 
- **How to use**: Set `page_timeout=your_desired_timeout` (in milliseconds) when calling `crawler.arun()`. -### 4. Enhanced LLM Support -- **What's new**: Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). -- **Why it matters**: Provides more flexibility in choosing AI models for content extraction. -- **How to use**: Specify the desired provider when using `LLMExtractionStrategy`. +### 2. Browser Type Selection +- Added support for different browser types (Chromium, Firefox, WebKit). +- Users can now specify the browser type when initializing AsyncWebCrawler. +- **How to use**: Set `browser_type="firefox"` or `browser_type="webkit"` when initializing AsyncWebCrawler. -## Improvements +### 3. Screenshot Capture +- Added ability to capture screenshots during crawling. +- Useful for debugging and content verification. +- **How to use**: Set `screenshot=True` when calling `crawler.arun()`. -### 1. Database Schema Auto-updates -- **What's new**: Automatic database schema updates. -- **Why it matters**: Ensures your database stays compatible with the latest version without manual intervention. +### 4. Enhanced LLM Extraction Strategy +- Added support for multiple LLM providers (OpenAI, Hugging Face, Ollama). +- **Custom Arguments**: Added support for passing extra arguments to LLM providers via `extra_args` parameter. +- **Custom Headers**: Users can now pass custom headers to the extraction strategy. +- **How to use**: Specify the desired provider and custom arguments when using `LLMExtractionStrategy`. -### 2. Enhanced Error Handling -- **What's new**: Improved error messages and logging. -- **Why it matters**: Makes debugging easier with more informative error messages. +### 5. iframe Content Extraction +- New feature to process and extract content from iframes. +- **How to use**: Set `process_iframes=True` in the crawl method. -### 3. Optimized Image Processing -- **What's new**: Refined image handling in `WebScrappingStrategy`. -- **Why it matters**: Improves the accuracy of content extraction for pages with images. +### 6. Delayed Content Retrieval +- Introduced `get_delayed_content` method in `AsyncCrawlResponse`. +- Allows retrieval of content after a specified delay, useful for dynamically loaded content. +- **How to use**: Access `result.get_delayed_content(delay_in_seconds)` after crawling. + +## Improvements and Optimizations + +### 1. AsyncWebCrawler Enhancements +- **Flexible Initialization**: Now accepts arbitrary keyword arguments, passed directly to the crawler strategy. +- Allows for more customized setups. + +### 2. Image Processing Optimization +- Enhanced image handling in WebScrappingStrategy. +- Added filtering for small, invisible, or irrelevant images. +- Improved image scoring system for better content relevance. +- Implemented JavaScript-based image dimension updating for more accurate representation. + +### 3. Database Schema Auto-updates +- Automatic database schema updates ensure compatibility with the latest version. + +### 4. Enhanced Error Handling and Logging +- Improved error messages and logging for easier debugging. + +### 5. Content Extraction Refinements +- Refined HTML sanitization process. +- Improved handling of base64 encoded images. +- Enhanced Markdown conversion process. +- Optimized content extraction algorithms. + +### 6. Utility Function Enhancements +- `perform_completion_with_backoff` function now supports additional arguments for more customized API calls to LLM providers. 
## Bug Fixes - - Fixed an issue where image tags were being prematurely removed during content extraction. +## Examples and Documentation +- Updated `quickstart_async.py` with examples of: + - Using custom headers in LLM extraction. + - Different LLM provider usage (OpenAI, Hugging Face, Ollama). + - Custom browser type usage. + ## Developer Notes +- Refactored code for better maintainability, flexibility, and performance. +- Enhanced type hinting throughout the codebase for improved development experience. +- Expanded error handling for more robust operation. -- Added examples for using different LLM providers in `quickstart_async.py`. -- Enhanced type hinting throughout the codebase for better development experience. - +These updates significantly enhance the flexibility, accuracy, and robustness of crawl4ai, providing users with more control and options for their web crawling and content extraction tasks. ## [v0.3.5] - 2024-09-02 From d06535388a5bbf5872a6253cf61a4a2bda9fc54e Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 22:53:56 +0800 Subject: [PATCH 13/20] Update gitignore --- .gitignore | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index b48005ba..f9fc7911 100644 --- a/.gitignore +++ b/.gitignore @@ -196,4 +196,8 @@ docs/.DS_Store tmp/ test_env/ **/.DS_Store -**/.DS_Store \ No newline at end of file +**/.DS_Store + +git_changes.md +git_changes.py +todo.md \ No newline at end of file From 9ffa34b6978a64ad7ab11bdd4112ea159932e002 Mon Sep 17 00:00:00 2001 From: unclecode Date: Mon, 14 Oct 2024 22:58:27 +0800 Subject: [PATCH 14/20] Update README --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 9fb7776c..a8c2b9b0 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,14 @@ Crawl4AI simplifies asynchronous web crawling and data extraction, making it acc > Looking for the synchronous version? Check out [README.sync.md](./README.sync.md). You can also access the previous version in the branch [V0.2.76](https://github.com/unclecode/crawl4ai/blob/v0.2.76). +## New update 0.3.6 +- 🌐 Multi-browser support (Chromium, Firefox, WebKit) +- 🖼️ Improved image processing with lazy-loading detection +- 🔧 Custom page timeout parameter for better control over crawling behavior +- 🕰️ Enhanced handling of delayed content loading +- 🔑 Custom headers support for LLM interactions +- 🖼️ iframe content extraction for comprehensive page analysis +- ⏱️ Flexible timeout and delayed content retrieval options ## Try it Now! From 8a7d29ce85056a51f03049b37c51d83d5304743c Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 15:59:37 +0530 Subject: [PATCH 15/20] updated some comments and removed content type checking functionality from core as it's implemented as a filter --- crawl4ai/scraper/bfs_scraper_strategy.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index dc89047a..9022cd90 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -24,18 +24,17 @@ class BFSScraperStrategy(ScraperStrategy): self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent - # 9. Crawl Politeness + # For Crawl Politeness self.last_crawl_time = defaultdict(float) self.min_crawl_delay = 1 # 1 second delay between requests to the same domain - # 5. 
Robots.txt Compliance + # For Robots.txt Compliance self.robot_parsers = {} - + # Robots.txt Parser def get_robot_parser(self, url: str) -> RobotFileParser: domain = urlparse(url) scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided netloc = domain.netloc - if netloc not in self.robot_parsers: rp = RobotFileParser() rp.set_url(f"{scheme}://{netloc}/robots.txt") @@ -90,11 +89,6 @@ class BFSScraperStrategy(ScraperStrategy): await self.add_to_retry_queue(url) return crawl_result - # Content Type Checking - # if 'text/html' not in crawl_result.response_header.get('Content-Type', ''): - # logging.info(f"Skipping non-HTML content: {url}") - # return crawl_result - visited.add(url) # Process links From 2943feeecf44806a000f5c01502798e52278bce7 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:05:29 +0530 Subject: [PATCH 16/20] 1. Added a flag to yield each crawl result as it becomes ready, along with the final scraper result as another option 2. Removed ascrape_many method, as I'm currently not focusing on it in the first cut of scraper 3. Added some error handling for cases where robots.txt cannot be fetched or parsed. --- crawl4ai/scraper/async_web_scraper.py | 48 ++++++++++++------------ crawl4ai/scraper/bfs_scraper_strategy.py | 33 +++++++++------- crawl4ai/scraper/scraper_strategy.py | 21 ++++++++++- 3 files changed, 62 insertions(+), 40 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index fadfa61f..811aeacc 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,35 +1,35 @@ import asyncio from typing import List, Dict from .scraper_strategy import ScraperStrategy -from .bfs_scraper_strategy import BFSScraperStrategy -from .models import ScraperResult +from .models import ScraperResult, CrawlResult from ..async_webcrawler import AsyncWebCrawler - -class BatchProcessor: - def __init__(self, batch_size: int, concurrency_limit: int): - self.batch_size = batch_size - self.concurrency_limit = concurrency_limit - - async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: - semaphore = asyncio.Semaphore(self.concurrency_limit) - async def scrape_with_semaphore(url): - async with semaphore: - return await scraper.ascrape(url) - return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) +from typing import Union, AsyncGenerator class AsyncWebScraper: def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): self.crawler = crawler self.strategy = strategy - self.batch_processor = BatchProcessor(batch_size, concurrency_limit) - async def ascrape(self, url: str, parallel_processing: bool = True) -> ScraperResult: - return await self.strategy.ascrape(url, self.crawler, parallel_processing) + async def ascrape(self, url: str, parallel_processing: bool = True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + if yield_results: + return self._ascrape_yielding(url, parallel_processing) + else: + return await self._ascrape_collecting(url, parallel_processing) - async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: - all_results = [] - for i in range(0, len(urls), self.batch_processor.batch_size): - batch = urls[i:i+self.batch_processor.batch_size] - batch_results = await self.batch_processor.process_batch(self, batch) - all_results.extend(batch_results) - return 
all_results \ No newline at end of file + async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]: + result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) + async for res in result_generator: # Consume the async generator + yield res # Yielding individual results + + async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult: + extracted_data = {} + result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing) + async for res in result_generator: # Consume the async generator + extracted_data[res.url] = res + + # Return a final ScraperResult + return ScraperResult( + url=url, + crawled_urls=list(extracted_data.keys()), + extracted_data=extracted_data + ) \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 9022cd90..1146714d 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -1,7 +1,6 @@ from .scraper_strategy import ScraperStrategy from .filters import FilterChain from .scorers import URLScorer -from .models import ScraperResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler import asyncio @@ -13,7 +12,7 @@ from aiolimiter import AsyncLimiter from tenacity import retry, stop_after_attempt, wait_exponential from collections import defaultdict import logging -from typing import Dict +from typing import Dict, AsyncGenerator logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request per second @@ -38,7 +37,12 @@ class BFSScraperStrategy(ScraperStrategy): if netloc not in self.robot_parsers: rp = RobotFileParser() rp.set_url(f"{scheme}://{netloc}/robots.txt") - rp.read() + try: + rp.read() + except Exception as e: + # Log the type of error, message, and the URL + logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}") + return None self.robot_parsers[netloc] = rp return self.robot_parsers[netloc] @@ -48,7 +52,7 @@ class BFSScraperStrategy(ScraperStrategy): async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult: return await crawler.arun(url) - async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> CrawlResult: + async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]: def normalize_url(url: str) -> str: parsed = urlparse(url) return urlunparse(parsed._replace(fragment="")) @@ -59,9 +63,14 @@ class BFSScraperStrategy(ScraperStrategy): return None # Robots.txt Compliance - if not self.get_robot_parser(url).can_fetch(crawler.crawler_strategy.user_agent, url): - logging.info(f"Skipping {url} as per robots.txt") - return None + robot_parser = self.get_robot_parser(url) + if robot_parser is None: + logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.") + else: + # If robots.txt was fetched, check if crawling is allowed + if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): + logging.info(f"Skipping {url} as per robots.txt") + return None # Crawl Politeness domain = urlparse(url).netloc @@ -103,14 +112,12 @@ class BFSScraperStrategy(ScraperStrategy): score = self.url_scorer.score(normalized_link) await queue.put((score, new_depth, normalized_link)) 
depths[normalized_link] = new_depth - return crawl_result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> ScraperResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> CrawlResult: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited = set() - extracted_data = {} depths = {start_url: 0} while not queue.empty(): @@ -124,12 +131,10 @@ class BFSScraperStrategy(ScraperStrategy): else: result = await self.process_url(url, depth, crawler, queue, visited, depths) if result: - extracted_data[result.url] = result + yield result if parallel_processing and tasks: results = await asyncio.gather(*tasks) for result in results: if result: - extracted_data[result.url] = result - - return ScraperResult(url=start_url, crawled_urls=list(visited), extracted_data=extracted_data) \ No newline at end of file + yield result \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index 6d1cdc74..e08a980d 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -1,9 +1,26 @@ from abc import ABC, abstractmethod -from .models import ScraperResult +from .models import ScraperResult, CrawlResult from ..models import CrawlResult from ..async_webcrawler import AsyncWebCrawler +from typing import Union, AsyncGenerator class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawler: AsyncWebCrawler) -> ScraperResult: + async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool=True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + """Scrape the given URL using the specified crawler. + + Args: + url (str): The starting URL for the scrape. + crawler (AsyncWebCrawler): The web crawler instance. + parallel_processing (bool): Whether to use parallel processing. Defaults to True. + yield_results (bool): If True, yields individual crawl results as they are ready; + if False, accumulates results and returns a final ScraperResult. + + Yields: + CrawlResult: Individual crawl results if yield_results is True. + + Returns: + ScraperResult: A summary of the scrape results containing the final extracted data + and the list of crawled URLs if yield_results is False. 
+ """ pass \ No newline at end of file From 04d8b47b927a5b3ba73e156e99292b76631c9c34 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:34:54 +0530 Subject: [PATCH 17/20] Exposed min_crawl_delay for BFSScraperStrategy --- crawl4ai/scraper/bfs_scraper_strategy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 1146714d..6fc39e73 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -18,14 +18,14 @@ logging.basicConfig(level=logging.DEBUG) rate_limiter = AsyncLimiter(1, 1) # 1 request per second class BFSScraperStrategy(ScraperStrategy): - def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5): + def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1): self.max_depth = max_depth self.filter_chain = filter_chain self.url_scorer = url_scorer self.max_concurrent = max_concurrent # For Crawl Politeness self.last_crawl_time = defaultdict(float) - self.min_crawl_delay = 1 # 1 second delay between requests to the same domain + self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain # For Robots.txt Compliance self.robot_parsers = {} From de28b59aca473b3292ecc7f6ab1f60dbd3ed488a Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Wed, 16 Oct 2024 22:36:48 +0530 Subject: [PATCH 18/20] removed unused imports --- crawl4ai/scraper/async_web_scraper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 811aeacc..0d921af5 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -1,5 +1,3 @@ -import asyncio -from typing import List, Dict from .scraper_strategy import ScraperStrategy from .models import ScraperResult, CrawlResult from ..async_webcrawler import AsyncWebCrawler From ce7fce4b1648761b90ad95cc699b2e13abe19be2 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 17 Oct 2024 12:25:17 +0530 Subject: [PATCH 19/20] 1. Moved to asyncio.wait instead of gather so that results can be yeilded just as they are ready, rather than in batches 2. Moved the visted.add(url), to before the task is put in queue rather than after the crawl is completed. This makes sure that duplicate crawls doesn't happen when same URL is found at different depth and that get's queued too because the crawl is not yet completed and visted set is not updated. 3. Named the yield_results attribute to stream instead. Since that seems to be popularly used in all other AI libraries for intermediate results. 
--- crawl4ai/scraper/async_web_scraper.py | 6 +++--- crawl4ai/scraper/bfs_scraper_strategy.py | 26 ++++++++++++++---------- crawl4ai/scraper/scraper_strategy.py | 8 ++++---- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py index 0d921af5..2fd919e1 100644 --- a/crawl4ai/scraper/async_web_scraper.py +++ b/crawl4ai/scraper/async_web_scraper.py @@ -4,12 +4,12 @@ from ..async_webcrawler import AsyncWebCrawler from typing import Union, AsyncGenerator class AsyncWebScraper: - def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy): self.crawler = crawler self.strategy = strategy - async def ascrape(self, url: str, parallel_processing: bool = True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: - if yield_results: + async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + if stream: return self._ascrape_yielding(url, parallel_processing) else: return await self._ascrape_collecting(url, parallel_processing) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index 6fc39e73..b6cdaa80 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -71,7 +71,7 @@ class BFSScraperStrategy(ScraperStrategy): if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url): logging.info(f"Skipping {url} as per robots.txt") return None - + # Crawl Politeness domain = urlparse(url).netloc time_since_last_crawl = time.time() - self.last_crawl_time[domain] @@ -97,8 +97,6 @@ class BFSScraperStrategy(ScraperStrategy): elif crawl_result.status_code == 503: await self.add_to_retry_queue(url) return crawl_result - - visited.add(url) # Process links for link_type in ["internal", "external"]: @@ -114,27 +112,33 @@ class BFSScraperStrategy(ScraperStrategy): depths[normalized_link] = new_depth return crawl_result - async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> CrawlResult: + async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]: queue = asyncio.PriorityQueue() queue.put_nowait((0, 0, start_url)) visited = set() depths = {start_url: 0} + pending_tasks = set() - while not queue.empty(): - tasks = [] - while not queue.empty() and len(tasks) < self.max_concurrent: + while not queue.empty() or pending_tasks: + while not queue.empty() and len(pending_tasks) < self.max_concurrent: _, depth, url = await queue.get() if url not in visited: + # Add the URL to the visited set here (instead of after result generation) + # so that no duplicate task is queued when the same URL is found at a different depth + # before this task's crawl and extraction have completed. 
+ visited.add(url) if parallel_processing: task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths)) - tasks.append(task) + pending_tasks.add(task) else: result = await self.process_url(url, depth, crawler, queue, visited, depths) if result: yield result - if parallel_processing and tasks: - results = await asyncio.gather(*tasks) - for result in results: + # Wait for the first task to complete and yield results incrementally as each task is completed + if pending_tasks: + done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED) + for task in done: + result = await task if result: yield result \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py index e08a980d..e4872de7 100644 --- a/crawl4ai/scraper/scraper_strategy.py +++ b/crawl4ai/scraper/scraper_strategy.py @@ -6,21 +6,21 @@ class ScraperStrategy(ABC): @abstractmethod - async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool=True, yield_results: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: + async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]: """Scrape the given URL using the specified crawler. Args: url (str): The starting URL for the scrape. crawler (AsyncWebCrawler): The web crawler instance. parallel_processing (bool): Whether to use parallel processing. Defaults to True. - yield_results (bool): If True, yields individual crawl results as they are ready; + stream (bool): If True, yields individual crawl results as they are ready; if False, accumulates results and returns a final ScraperResult. Yields: - CrawlResult: Individual crawl results if yield_results is True. + CrawlResult: Individual crawl results if stream is True. Returns: ScraperResult: A summary of the scrape results containing the final extracted data - and the list of crawled URLs if yield_results is False. + and the list of crawled URLs if stream is False. """ pass \ No newline at end of file From 8105fd178e1b7b00a4628e2227953fbe418af5c4 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Thu, 17 Oct 2024 15:42:43 +0530 Subject: [PATCH 20/20] Removed stubs for remove_from_future_crawls since the visited set is updated as soon as the URL is queued. Removed add_to_retry_queue(url) since retry with exponential backoff, with the help of tenacity, takes care of it. --- crawl4ai/scraper/bfs_scraper_strategy.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py index b6cdaa80..ce4d0127 100644 --- a/crawl4ai/scraper/bfs_scraper_strategy.py +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -91,11 +91,6 @@ class BFSScraperStrategy(ScraperStrategy): if not crawl_result.success: # Logging and Monitoring logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}") - # Error Categorization - if crawl_result.status_code == 404: - self.remove_from_future_crawls(url) - elif crawl_result.status_code == 503: - await self.add_to_retry_queue(url) return crawl_result # Process links
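With the manual retry queue removed, transient failures are left to the `tenacity` decorator already applied to `retry_crawl` in the strategy. A minimal sketch of that pattern; the stop/wait parameters and the `fetch_with_retry` coroutine are illustrative, not the strategy's exact values:

```python
import asyncio
import random
from tenacity import retry, stop_after_attempt, wait_exponential

# Illustrative parameters: retry up to 3 times, backing off exponentially
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
async def fetch_with_retry(url: str) -> str:
    # Stand-in for crawler.arun(url): raising here makes tenacity re-invoke
    # the coroutine with exponential backoff, replacing the removed
    # add_to_retry_queue bookkeeping
    if random.random() < 0.5:
        raise ConnectionError(f"transient failure fetching {url}")
    return f"content of {url}"

async def main():
    print(await fetch_with_retry("https://www.example.com"))

asyncio.run(main())
```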