From 44ce12c62c5c02421cd760c89d8ffda9dd59c208 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 9 Sep 2024 13:13:34 +0530 Subject: [PATCH] Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy --- crawl4ai/scraper/__init__.py | 0 crawl4ai/scraper/async_web_scraper.py | 36 +++++++++++++ crawl4ai/scraper/bfs_scraper_strategy.py | 50 +++++++++++++++++++ crawl4ai/scraper/filters/__init__.py | 3 ++ .../scraper/filters/content_type_filter.py | 8 +++ crawl4ai/scraper/filters/url_filter.py | 16 ++++++ .../scraper/filters/url_pattern_filter.py | 9 ++++ crawl4ai/scraper/models.py | 7 +++ crawl4ai/scraper/scorers/__init__.py | 2 + .../scorers/keyword_relevance_scorer.py | 9 ++++ crawl4ai/scraper/scorers/url_scorer.py | 6 +++ crawl4ai/scraper/scraper_strategy.py | 9 ++++ 12 files changed, 155 insertions(+) create mode 100644 crawl4ai/scraper/__init__.py create mode 100644 crawl4ai/scraper/async_web_scraper.py create mode 100644 crawl4ai/scraper/bfs_scraper_strategy.py create mode 100644 crawl4ai/scraper/filters/__init__.py create mode 100644 crawl4ai/scraper/filters/content_type_filter.py create mode 100644 crawl4ai/scraper/filters/url_filter.py create mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py create mode 100644 crawl4ai/scraper/models.py create mode 100644 crawl4ai/scraper/scorers/__init__.py create mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py create mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 crawl4ai/scraper/scraper_strategy.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py new file mode 100644 index 00000000..c67f0e14 --- /dev/null +++ b/crawl4ai/scraper/async_web_scraper.py @@ -0,0 +1,36 @@ +import asyncio +from typing import List, Dict +from .scraper_strategy import ScraperStrategy +from 
.bfs_scraper_strategy import BFSScraperStrategy +from .models import ScraperResult +from ..async_webcrawler import AsyncWebCrawler + +class BatchProcessor: + def __init__(self, batch_size: int, concurrency_limit: int): + self.batch_size = batch_size + self.concurrency_limit = concurrency_limit + + async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: + semaphore = asyncio.Semaphore(self.concurrency_limit) + async def scrape_with_semaphore(url): + async with semaphore: + return await scraper.ascrape(url) + return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) + +class AsyncWebScraper: + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + self.crawler = crawler + self.strategy = strategy + self.batch_processor = BatchProcessor(batch_size, concurrency_limit) + + async def ascrape(self, url: str) -> ScraperResult: + crawl_result = await self.crawler.arun(url) + return await self.strategy.ascrape(url, crawl_result, self.crawler) + + async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: + all_results = [] + for i in range(0, len(urls), self.batch_processor.batch_size): + batch = urls[i:i+self.batch_processor.batch_size] + batch_results = await self.batch_processor.process_batch(self, batch) + all_results.extend(batch_results) + return all_results \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py new file mode 100644 index 00000000..9add962e --- /dev/null +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -0,0 +1,50 @@ +from .scraper_strategy import ScraperStrategy +from .filters import FilterChain +from .scorers import URLScorer +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler +import asyncio +from urllib.parse import urljoin + +class BFSScraperStrategy(ScraperStrategy): + def 
__init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + + async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + queue = asyncio.PriorityQueue() + queue.put_nowait((0, 0, start_url)) # (score, depth, url) + visited = set() + crawled_urls = [] + extracted_data = {} + + while not queue.empty(): + _, depth, url = await queue.get() + if depth > self.max_depth or url in visited: + continue + crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) + visited.add(url) + crawled_urls.append(url) + extracted_data[url] = crawl_result + if not crawl_result.success: + print(f"failed to crawl -- {url}") + continue + for internal in crawl_result.links["internal"]: + link = internal['href'] + is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) + is_fragment = '#' in link + if not (is_fragment or is_special_uri): + # To fix partial links: eg:'/support' to 'https://example.com/support' + absolute_link = urljoin(url, link) + if self.filter_chain.apply(absolute_link) and absolute_link not in visited: + score = self.url_scorer.score(absolute_link) + await queue.put(((1 / score) if score else float('inf'), depth + 1, absolute_link)) + for external in crawl_result.links["external"]: + link = external['href'] + if self.filter_chain.apply(link) and link not in visited: + score = self.url_scorer.score(link) + await queue.put(((1 / score) if score else float('inf'), depth + 1, link)) + + return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py new file mode 100644 index 00000000..525c9bdb --- /dev/null +++ b/crawl4ai/scraper/filters/__init__.py @@ -0,0 +1,3 @@ +from .url_filter import 
URLFilter, FilterChain +from .content_type_filter import ContentTypeFilter +from .url_pattern_filter import URLPatternFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py new file mode 100644 index 00000000..9173eb4a --- /dev/null +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -0,0 +1,8 @@ +from .url_filter import URLFilter + +class ContentTypeFilter(URLFilter): + def __init__(self, contentType: str): + self.contentType = contentType + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later + return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py new file mode 100644 index 00000000..2b8bd6eb --- /dev/null +++ b/crawl4ai/scraper/filters/url_filter.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +class URLFilter(ABC): + @abstractmethod + def apply(self, url: str) -> bool: + pass + +class FilterChain: + def __init__(self): + self.filters = [] + + def add_filter(self, filter: URLFilter): + self.filters.append(filter) + + def apply(self, url: str) -> bool: + return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py new file mode 100644 index 00000000..fd5df133 --- /dev/null +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -0,0 +1,9 @@ +from .url_filter import URLFilter +from re import Pattern + +class URLPatternFilter(URLFilter): + def __init__(self, pattern: Pattern): + self.pattern = pattern + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later. 
+ return True \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py new file mode 100644 index 00000000..9ffdac52 --- /dev/null +++ b/crawl4ai/scraper/models.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import List, Dict + +class ScraperResult(BaseModel): + url: str + crawled_urls: List[str] + extracted_data: Dict \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py new file mode 100644 index 00000000..05c61c94 --- /dev/null +++ b/crawl4ai/scraper/scorers/__init__.py @@ -0,0 +1,2 @@ +from .url_scorer import URLScorer +from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py new file mode 100644 index 00000000..a2338aec --- /dev/null +++ b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py @@ -0,0 +1,9 @@ +from .url_scorer import URLScorer +from typing import List + +class KeywordRelevanceScorer(URLScorer): + def __init__(self, keywords: List[str]): + self.keywords = keywords + def score(self, url: str) -> float: + #TODO: This is a stub. Will implement this later. 
+ return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py new file mode 100644 index 00000000..6ee9ab05 --- /dev/null +++ b/crawl4ai/scraper/scorers/url_scorer.py @@ -0,0 +1,6 @@ +from abc import ABC, abstractmethod + +class URLScorer(ABC): + @abstractmethod + def score(self, url: str) -> float: + pass \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py new file mode 100644 index 00000000..16df9ece --- /dev/null +++ b/crawl4ai/scraper/scraper_strategy.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler + +class ScraperStrategy(ABC): + @abstractmethod + async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + pass \ No newline at end of file