From 44ce12c62c5c02421cd760c89d8ffda9dd59c208 Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Mon, 9 Sep 2024 13:13:34 +0530 Subject: [PATCH] Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy --- crawl4ai/scraper/__init__.py | 0 crawl4ai/scraper/async_web_scraper.py | 36 +++++++++++++ crawl4ai/scraper/bfs_scraper_strategy.py | 50 +++++++++++++++++++ crawl4ai/scraper/filters/__init__.py | 3 ++ .../scraper/filters/content_type_filter.py | 8 +++ crawl4ai/scraper/filters/url_filter.py | 16 ++++++ .../scraper/filters/url_pattern_filter.py | 9 ++++ crawl4ai/scraper/models.py | 7 +++ crawl4ai/scraper/scorers/__init__.py | 2 + .../scorers/keyword_relevance_scorer.py | 9 ++++ crawl4ai/scraper/scorers/url_scorer.py | 6 +++ crawl4ai/scraper/scraper_strategy.py | 9 ++++ 12 files changed, 155 insertions(+) create mode 100644 crawl4ai/scraper/__init__.py create mode 100644 crawl4ai/scraper/async_web_scraper.py create mode 100644 crawl4ai/scraper/bfs_scraper_strategy.py create mode 100644 crawl4ai/scraper/filters/__init__.py create mode 100644 crawl4ai/scraper/filters/content_type_filter.py create mode 100644 crawl4ai/scraper/filters/url_filter.py create mode 100644 crawl4ai/scraper/filters/url_pattern_filter.py create mode 100644 crawl4ai/scraper/models.py create mode 100644 crawl4ai/scraper/scorers/__init__.py create mode 100644 crawl4ai/scraper/scorers/keyword_relevance_scorer.py create mode 100644 crawl4ai/scraper/scorers/url_scorer.py create mode 100644 crawl4ai/scraper/scraper_strategy.py diff --git a/crawl4ai/scraper/__init__.py b/crawl4ai/scraper/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/crawl4ai/scraper/async_web_scraper.py b/crawl4ai/scraper/async_web_scraper.py new file mode 100644 index 00000000..c67f0e14 --- /dev/null +++ b/crawl4ai/scraper/async_web_scraper.py @@ -0,0 +1,36 @@ +import asyncio +from typing import List, Dict +from .scraper_strategy import ScraperStrategy +from 
.bfs_scraper_strategy import BFSScraperStrategy +from .models import ScraperResult +from ..async_webcrawler import AsyncWebCrawler + +class BatchProcessor: + def __init__(self, batch_size: int, concurrency_limit: int): + self.batch_size = batch_size + self.concurrency_limit = concurrency_limit + + async def process_batch(self, scraper: 'AsyncWebScraper', urls: List[str]) -> List[ScraperResult]: + semaphore = asyncio.Semaphore(self.concurrency_limit) + async def scrape_with_semaphore(url): + async with semaphore: + return await scraper.ascrape(url) + return await asyncio.gather(*[scrape_with_semaphore(url) for url in urls]) + +class AsyncWebScraper: + def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy, batch_size: int = 10, concurrency_limit: int = 5): + self.crawler = crawler + self.strategy = strategy + self.batch_processor = BatchProcessor(batch_size, concurrency_limit) + + async def ascrape(self, url: str) -> ScraperResult: + crawl_result = await self.crawler.arun(url) + return await self.strategy.ascrape(url, crawl_result, self.crawler) + + async def ascrape_many(self, urls: List[str]) -> List[ScraperResult]: + all_results = [] + for i in range(0, len(urls), self.batch_processor.batch_size): + batch = urls[i:i+self.batch_processor.batch_size] + batch_results = await self.batch_processor.process_batch(self, batch) + all_results.extend(batch_results) + return all_results \ No newline at end of file diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py new file mode 100644 index 00000000..9add962e --- /dev/null +++ b/crawl4ai/scraper/bfs_scraper_strategy.py @@ -0,0 +1,50 @@ +from .scraper_strategy import ScraperStrategy +from .filters import FilterChain +from .scorers import URLScorer +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler +import asyncio +from urllib.parse import urljoin + +class BFSScraperStrategy(ScraperStrategy): + def 
__init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer): + self.max_depth = max_depth + self.filter_chain = filter_chain + self.url_scorer = url_scorer + + async def ascrape(self, start_url: str, initial_crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + queue = asyncio.PriorityQueue() + queue.put_nowait((0, 0, start_url)) # (score, depth, url) + visited = set() + crawled_urls = [] + extracted_data = {} + + while not queue.empty(): + _, depth, url = await queue.get() + if depth > self.max_depth or url in visited: + continue + crawl_result = initial_crawl_result if url == start_url else await crawler.arun(url) + visited.add(url) + crawled_urls.append(url) + extracted_data[url] = crawl_result + if not crawl_result.success: + print(f"failed to crawl -- {url}") + continue + for internal in crawl_result.links["internal"]: + link = internal['href'] + is_special_uri = any(link.startswith(scheme) for scheme in ('tel:', 'mailto:', 'sms:', 'geo:', 'fax:', 'file:', 'data:', 'sip:', 'ircs:', 'magnet:')) + is_fragment = '#' in link + if not (is_fragment or is_special_uri): + # To fix partial links: eg:'/support' to 'https://example.com/support' + absolute_link = urljoin(url, link) + if self.filter_chain.apply(absolute_link) and absolute_link not in visited: + score = self.url_scorer.score(absolute_link) + await queue.put(((1 / score) if score else float('inf'), depth + 1, absolute_link)) + for external in crawl_result.links["external"]: + link = external['href'] + if self.filter_chain.apply(link) and link not in visited: + score = self.url_scorer.score(link) + await queue.put(((1 / score) if score else float('inf'), depth + 1, link)) + + return ScraperResult(url=start_url, crawled_urls=crawled_urls, extracted_data=extracted_data) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/__init__.py b/crawl4ai/scraper/filters/__init__.py new file mode 100644 index 00000000..525c9bdb --- /dev/null +++ b/crawl4ai/scraper/filters/__init__.py @@ -0,0 +1,3 @@ +from .url_filter import 
URLFilter, FilterChain +from .content_type_filter import ContentTypeFilter +from .url_pattern_filter import URLPatternFilter \ No newline at end of file diff --git a/crawl4ai/scraper/filters/content_type_filter.py b/crawl4ai/scraper/filters/content_type_filter.py new file mode 100644 index 00000000..9173eb4a --- /dev/null +++ b/crawl4ai/scraper/filters/content_type_filter.py @@ -0,0 +1,8 @@ +from .url_filter import URLFilter + +class ContentTypeFilter(URLFilter): + def __init__(self, contentType: str): + self.contentType = contentType + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later + return True \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_filter.py b/crawl4ai/scraper/filters/url_filter.py new file mode 100644 index 00000000..2b8bd6eb --- /dev/null +++ b/crawl4ai/scraper/filters/url_filter.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +class URLFilter(ABC): + @abstractmethod + def apply(self, url: str) -> bool: + pass + +class FilterChain: + def __init__(self): + self.filters = [] + + def add_filter(self, filter: URLFilter): + self.filters.append(filter) + + def apply(self, url: str) -> bool: + return all(filter.apply(url) for filter in self.filters) \ No newline at end of file diff --git a/crawl4ai/scraper/filters/url_pattern_filter.py b/crawl4ai/scraper/filters/url_pattern_filter.py new file mode 100644 index 00000000..fd5df133 --- /dev/null +++ b/crawl4ai/scraper/filters/url_pattern_filter.py @@ -0,0 +1,9 @@ +from .url_filter import URLFilter +from re import Pattern + +class URLPatternFilter(URLFilter): + def __init__(self, pattern: Pattern): + self.pattern = pattern + def apply(self, url: str) -> bool: + #TODO: This is a stub. Will implement this later. 
+ return True \ No newline at end of file diff --git a/crawl4ai/scraper/models.py b/crawl4ai/scraper/models.py new file mode 100644 index 00000000..9ffdac52 --- /dev/null +++ b/crawl4ai/scraper/models.py @@ -0,0 +1,7 @@ +from pydantic import BaseModel +from typing import List, Dict + +class ScraperResult(BaseModel): + url: str + crawled_urls: List[str] + extracted_data: Dict \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/__init__.py b/crawl4ai/scraper/scorers/__init__.py new file mode 100644 index 00000000..05c61c94 --- /dev/null +++ b/crawl4ai/scraper/scorers/__init__.py @@ -0,0 +1,2 @@ +from .url_scorer import URLScorer +from .keyword_relevance_scorer import KeywordRelevanceScorer \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/keyword_relevance_scorer.py b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py new file mode 100644 index 00000000..a2338aec --- /dev/null +++ b/crawl4ai/scraper/scorers/keyword_relevance_scorer.py @@ -0,0 +1,9 @@ +from .url_scorer import URLScorer +from typing import List + +class KeywordRelevanceScorer(URLScorer): + def __init__(self, keywords: List[str]): + self.keywords = keywords + def score(self, url: str) -> float: + #TODO: This is a stub. Will implement this later. 
+ return 1 \ No newline at end of file diff --git a/crawl4ai/scraper/scorers/url_scorer.py b/crawl4ai/scraper/scorers/url_scorer.py new file mode 100644 index 00000000..6ee9ab05 --- /dev/null +++ b/crawl4ai/scraper/scorers/url_scorer.py @@ -0,0 +1,6 @@ +from abc import ABC, abstractmethod + +class URLScorer(ABC): + @abstractmethod + def score(self, url: str) -> float: + pass \ No newline at end of file diff --git a/crawl4ai/scraper/scraper_strategy.py b/crawl4ai/scraper/scraper_strategy.py new file mode 100644 index 00000000..16df9ece --- /dev/null +++ b/crawl4ai/scraper/scraper_strategy.py @@ -0,0 +1,9 @@ +from abc import ABC, abstractmethod +from .models import ScraperResult +from ..models import CrawlResult +from ..async_webcrawler import AsyncWebCrawler + +class ScraperStrategy(ABC): + @abstractmethod + async def ascrape(self, url: str, crawl_result: CrawlResult, crawler: AsyncWebCrawler) -> ScraperResult: + pass \ No newline at end of file