Compare commits

...

16 Commits

Author SHA1 Message Date
UncleCode
d21ffad3a2 chore(git): update gitignore patterns
Add new development and tooling related patterns to gitignore:
- Add Next.js build directory (.next/)
- Add various script and documentation files
- Add local development directories (.local, .scripts, .do)
- Add tool-specific files (.codeiumignore, .windsurfrules)

Removes duplicate entries and organizes patterns more clearly.
2025-01-22 17:22:26 +08:00
UncleCode
06b21dcc50 Update .gitignore to include new directories for issues and documentation 2024-11-06 18:44:03 +08:00
UncleCode
0f0f60527d Merge pull request #172 from aravindkarnam/scraper
Scraper
2024-11-06 07:00:44 +01:00
Aravind Karnam
8105fd178e Removed stubs for remove_from_future_crawls since the visited set is updated soon as the URL was queued, Removed add_to_retry_queue(url) since retry with exponential backoff with help of tenacity is going to take care of it. 2024-10-17 15:42:43 +05:30
Aravind Karnam
ce7fce4b16 1. Moved to asyncio.wait instead of gather so that results can be yeilded just as they are ready, rather than in batches
2. Moved the visted.add(url), to before the task is put in queue rather than after the crawl is completed. This makes sure that  duplicate crawls doesn't happen when same URL is found at different depth and that get's queued too because the crawl is not yet completed and visted set is not updated.
3. Named the yield_results attribute to stream instead. Since that seems to be popularly used in all other AI libraries for intermediate results.
2024-10-17 12:25:17 +05:30
Aravind Karnam
de28b59aca removed unused imports 2024-10-16 22:36:48 +05:30
Aravind Karnam
04d8b47b92 Exposed min_crawl_delay for BFSScraperStrategy 2024-10-16 22:34:54 +05:30
Aravind Karnam
2943feeecf 1. Added a flag to yield each crawl result,as they become ready along with the final scraper result as another option
2. Removed ascrape_many method, as I'm currently not focusing on it in the first cut of scraper
3. Added some error handling for cases where robots.txt cannot be fetched or parsed.
2024-10-16 22:05:29 +05:30
Aravind Karnam
8a7d29ce85 updated some comments and removed content type checking functionality from core as it's implemented as a filter 2024-10-16 15:59:37 +05:30
aravind
159bd875bd Merge pull request #5 from aravindkarnam/main
Merging 0.3.6
2024-10-16 10:41:22 +05:30
Aravind Karnam
d743adac68 Fixed some bugs in robots.txt processing 2024-10-03 15:58:57 +05:30
Aravind Karnam
7fe220dbd5 1. Introduced a bool flag to ascrape method to switch between sequential and concurrent processing
2. Introduced a dictionary for depth tracking across various tasks
3. Removed redundancy with crawled_urls variable. Instead created a list with visited set variable in returned object.
2024-10-03 11:17:11 +05:30
aravind
65e013d9d1 Merge pull request #3 from aravindkarnam/main
Merging latest changes from main branch
2024-10-03 09:52:12 +05:30
Aravind Karnam
7f3e2e47ed Parallel processing with retry on failure with exponential backoff - Simplified URL validation and normalisation - respecting Robots.txt 2024-09-19 12:34:12 +05:30
aravind
78f26ac263 Merge pull request #2 from aravindkarnam/staging
Staging
2024-09-18 18:16:23 +05:30
Aravind Karnam
44ce12c62c Created scaffolding for Scraper as per the plan. Implemented the ascrape method in bfs_scraper_strategy 2024-09-09 13:13:34 +05:30
13 changed files with 289 additions and 1 deletions

29
.gitignore vendored
View File

@@ -199,8 +199,35 @@ test_env/
**/.DS_Store
todo.md
todo_executor.md
git_changes.py
git_changes.md
pypi_build.sh
git_issues.py
git_issues.md
.tests/
.next/
.tests/
# .issues/
.docs/
.issues/
.gitboss/
todo_executor.md
protect-all-except-feature.sh
manage-collab.sh
publish.sh
combine.sh
combined_output.txt
.local
.scripts
tree.md
tree.md
.scripts
.local
.do
/plans
.codeiumignore
todo/
# windsurf rules
.windsurfrules

View File

@@ -0,0 +1,2 @@
from .async_web_scraper import AsyncWebScraper
from .bfs_scraper_strategy import BFSScraperStrategy

View File

@@ -0,0 +1,33 @@
from .scraper_strategy import ScraperStrategy
from .models import ScraperResult, CrawlResult
from ..async_webcrawler import AsyncWebCrawler
from typing import Union, AsyncGenerator
class AsyncWebScraper:
def __init__(self, crawler: AsyncWebCrawler, strategy: ScraperStrategy):
self.crawler = crawler
self.strategy = strategy
async def ascrape(self, url: str, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
if stream:
return self._ascrape_yielding(url, parallel_processing)
else:
return await self._ascrape_collecting(url, parallel_processing)
async def _ascrape_yielding(self, url: str, parallel_processing: bool) -> AsyncGenerator[CrawlResult, None]:
result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
async for res in result_generator: # Consume the async generator
yield res # Yielding individual results
async def _ascrape_collecting(self, url: str, parallel_processing: bool) -> ScraperResult:
extracted_data = {}
result_generator = self.strategy.ascrape(url, self.crawler, parallel_processing)
async for res in result_generator: # Consume the async generator
extracted_data[res.url] = res
# Return a final ScraperResult
return ScraperResult(
url=url,
crawled_urls=list(extracted_data.keys()),
extracted_data=extracted_data
)

View File

@@ -0,0 +1,139 @@
from .scraper_strategy import ScraperStrategy
from .filters import FilterChain
from .scorers import URLScorer
from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler
import asyncio
import validators
from urllib.parse import urljoin,urlparse,urlunparse
from urllib.robotparser import RobotFileParser
import time
from aiolimiter import AsyncLimiter
from tenacity import retry, stop_after_attempt, wait_exponential
from collections import defaultdict
import logging
from typing import Dict, AsyncGenerator
logging.basicConfig(level=logging.DEBUG)
rate_limiter = AsyncLimiter(1, 1) # 1 request per second
class BFSScraperStrategy(ScraperStrategy):
def __init__(self, max_depth: int, filter_chain: FilterChain, url_scorer: URLScorer, max_concurrent: int = 5, min_crawl_delay: int=1):
self.max_depth = max_depth
self.filter_chain = filter_chain
self.url_scorer = url_scorer
self.max_concurrent = max_concurrent
# For Crawl Politeness
self.last_crawl_time = defaultdict(float)
self.min_crawl_delay = min_crawl_delay # 1 second delay between requests to the same domain
# For Robots.txt Compliance
self.robot_parsers = {}
# Robots.txt Parser
def get_robot_parser(self, url: str) -> RobotFileParser:
domain = urlparse(url)
scheme = domain.scheme if domain.scheme else 'http' # Default to 'http' if no scheme provided
netloc = domain.netloc
if netloc not in self.robot_parsers:
rp = RobotFileParser()
rp.set_url(f"{scheme}://{netloc}/robots.txt")
try:
rp.read()
except Exception as e:
# Log the type of error, message, and the URL
logging.warning(f"Error {type(e).__name__} occurred while fetching robots.txt for {netloc}: {e}")
return None
self.robot_parsers[netloc] = rp
return self.robot_parsers[netloc]
# Retry with exponential backoff
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def retry_crawl(self, crawler: AsyncWebCrawler, url: str) -> CrawlResult:
return await crawler.arun(url)
async def process_url(self, url: str, depth: int, crawler: AsyncWebCrawler, queue: asyncio.PriorityQueue, visited: set, depths: Dict[str, int]) -> AsyncGenerator[CrawlResult, None]:
def normalize_url(url: str) -> str:
parsed = urlparse(url)
return urlunparse(parsed._replace(fragment=""))
# URL Validation
if not validators.url(url):
logging.warning(f"Invalid URL: {url}")
return None
# Robots.txt Compliance
robot_parser = self.get_robot_parser(url)
if robot_parser is None:
logging.info(f"Could not retrieve robots.txt for {url}, hence proceeding with crawl.")
else:
# If robots.txt was fetched, check if crawling is allowed
if not robot_parser.can_fetch(crawler.crawler_strategy.user_agent, url):
logging.info(f"Skipping {url} as per robots.txt")
return None
# Crawl Politeness
domain = urlparse(url).netloc
time_since_last_crawl = time.time() - self.last_crawl_time[domain]
if time_since_last_crawl < self.min_crawl_delay:
await asyncio.sleep(self.min_crawl_delay - time_since_last_crawl)
self.last_crawl_time[domain] = time.time()
# Rate Limiting
async with rate_limiter:
# Error Handling
try:
crawl_result = await self.retry_crawl(crawler, url)
except Exception as e:
logging.error(f"Error crawling {url}: {str(e)}")
crawl_result = CrawlResult(url=url, html="", success=False, status_code=0, error_message=str(e))
if not crawl_result.success:
# Logging and Monitoring
logging.error(f"Failed to crawl URL: {url}. Error: {crawl_result.error_message}")
return crawl_result
# Process links
for link_type in ["internal", "external"]:
for link in crawl_result.links[link_type]:
absolute_link = urljoin(url, link['href'])
normalized_link = normalize_url(absolute_link)
if self.filter_chain.apply(normalized_link) and normalized_link not in visited:
new_depth = depths[url] + 1
if new_depth <= self.max_depth:
# URL Scoring
score = self.url_scorer.score(normalized_link)
await queue.put((score, new_depth, normalized_link))
depths[normalized_link] = new_depth
return crawl_result
async def ascrape(self, start_url: str, crawler: AsyncWebCrawler, parallel_processing:bool = True) -> AsyncGenerator[CrawlResult,None]:
queue = asyncio.PriorityQueue()
queue.put_nowait((0, 0, start_url))
visited = set()
depths = {start_url: 0}
pending_tasks = set()
while not queue.empty() or pending_tasks:
while not queue.empty() and len(pending_tasks) < self.max_concurrent:
_, depth, url = await queue.get()
if url not in visited:
# Adding URL to the visited set here itself, (instead of after result generation)
# so that other tasks are not queued for same URL, found at different depth before
# crawling and extraction of this task is completed.
visited.add(url)
if parallel_processing:
task = asyncio.create_task(self.process_url(url, depth, crawler, queue, visited, depths))
pending_tasks.add(task)
else:
result = await self.process_url(url, depth, crawler, queue, visited, depths)
if result:
yield result
# Wait for the first task to complete and yield results incrementally as each task is completed
if pending_tasks:
done, pending_tasks = await asyncio.wait(pending_tasks, return_when=asyncio.FIRST_COMPLETED)
for task in done:
result = await task
if result:
yield result

View File

@@ -0,0 +1,3 @@
from .url_filter import URLFilter, FilterChain
from .content_type_filter import ContentTypeFilter
from .url_pattern_filter import URLPatternFilter

View File

@@ -0,0 +1,8 @@
from .url_filter import URLFilter
class ContentTypeFilter(URLFilter):
def __init__(self, contentType: str):
self.contentType = contentType
def apply(self, url: str) -> bool:
#TODO: This is a stub. Will implement this later
return True

View File

@@ -0,0 +1,16 @@
from abc import ABC, abstractmethod
class URLFilter(ABC):
@abstractmethod
def apply(self, url: str) -> bool:
pass
class FilterChain:
def __init__(self):
self.filters = []
def add_filter(self, filter: URLFilter):
self.filters.append(filter)
def apply(self, url: str) -> bool:
return all(filter.apply(url) for filter in self.filters)

View File

@@ -0,0 +1,9 @@
from .url_filter import URLFilter
from re import Pattern
class URLPatternFilter(URLFilter):
def __init__(self, pattern: Pattern):
self.pattern = pattern
def apply(self, url: str) -> bool:
#TODO: This is a stub. Will implement this later.
return True

View File

@@ -0,0 +1,8 @@
from pydantic import BaseModel
from typing import List, Dict
from ..models import CrawlResult
class ScraperResult(BaseModel):
url: str
crawled_urls: List[str]
extracted_data: Dict[str,CrawlResult]

View File

@@ -0,0 +1,2 @@
from .url_scorer import URLScorer
from .keyword_relevance_scorer import KeywordRelevanceScorer

View File

@@ -0,0 +1,9 @@
from .url_scorer import URLScorer
from typing import List
class KeywordRelevanceScorer(URLScorer):
def __init__(self,keywords: List[str]):
self.keyworkds = keywords
def score(self, url: str) -> float:
#TODO: This is a stub. Will implement this later.
return 1

View File

@@ -0,0 +1,6 @@
from abc import ABC, abstractmethod
class URLScorer(ABC):
@abstractmethod
def score(self, url: str) -> float:
pass

View File

@@ -0,0 +1,26 @@
from abc import ABC, abstractmethod
from .models import ScraperResult, CrawlResult
from ..models import CrawlResult
from ..async_webcrawler import AsyncWebCrawler
from typing import Union, AsyncGenerator
class ScraperStrategy(ABC):
@abstractmethod
async def ascrape(self, url: str, crawler: AsyncWebCrawler, parallel_processing: bool = True, stream: bool = False) -> Union[AsyncGenerator[CrawlResult, None], ScraperResult]:
"""Scrape the given URL using the specified crawler.
Args:
url (str): The starting URL for the scrape.
crawler (AsyncWebCrawler): The web crawler instance.
parallel_processing (bool): Whether to use parallel processing. Defaults to True.
stream (bool): If True, yields individual crawl results as they are ready;
if False, accumulates results and returns a final ScraperResult.
Yields:
CrawlResult: Individual crawl results if stream is True.
Returns:
ScraperResult: A summary of the scrape results containing the final extracted data
and the list of crawled URLs if stream is False.
"""
pass