From b13fd71040184851c367bd287d07772459eeb07a Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Tue, 26 Nov 2024 10:07:11 +0530
Subject: [PATCH] chore: 1. Expose process_external_links as a param 2. Remove
 a few unused imports 3. Remove separate URL normalisation for external links,
 as it is no longer necessary

---
 crawl4ai/scraper/bfs_scraper_strategy.py | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index b87412ef..c12bf42e 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -1,10 +1,9 @@
-from abc import ABC, abstractmethod
-from typing import Union, AsyncGenerator, Optional, Dict, Set
+from typing import AsyncGenerator, Optional, Dict, Set
 from dataclasses import dataclass
 from datetime import datetime
 import asyncio
 import logging
-from urllib.parse import urljoin, urlparse, urlunparse
+from urllib.parse import urlparse
 from urllib.robotparser import RobotFileParser
 import validators
 import time
@@ -12,7 +11,7 @@
 from aiolimiter import AsyncLimiter
 from tenacity import retry, stop_after_attempt, wait_exponential
 from collections import defaultdict
-from .models import ScraperResult, CrawlResult
+from .models import CrawlResult
 from .filters import FilterChain
 from .scorers import URLScorer
 from ..async_webcrawler import AsyncWebCrawler
@@ -37,6 +36,7 @@ class BFSScraperStrategy(ScraperStrategy):
         max_depth: int,
         filter_chain: FilterChain,
         url_scorer: URLScorer,
+        process_external_links: bool = False,
         max_concurrent: int = 5,
         min_crawl_delay: int = 1,
         timeout: int = 30,
@@ -53,7 +53,7 @@ class BFSScraperStrategy(ScraperStrategy):
         # Crawl control
         self.stats = CrawlStats(start_time=datetime.now())
         self._cancel_event = asyncio.Event()
-        self.process_external_links = False
+        self.process_external_links = process_external_links
 
         # Rate limiting and politeness
         self.rate_limiter = AsyncLimiter(1, 1)
@@ -189,14 +189,11 @@ class BFSScraperStrategy(ScraperStrategy):
         Adds valid URLs to the queue
         Updates maximum depth statistics
         """
-        links_ro_process = result.links["internal"]
+        links_to_process = result.links["internal"]
         if self.process_external_links:
-            links_ro_process += result.links["external"]
-        for link in links_ro_process:
+            links_to_process += result.links["external"]
+        for link in links_to_process:
             url = link['href']
-            # url = urljoin(source_url, link['href'])
-            # url = urlunparse(urlparse(url)._replace(fragment=""))
-
             if url not in visited and await self.can_process_url(url):
                 new_depth = depths[source_url] + 1
                 if new_depth <= self.max_depth:
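
For reference, a minimal sketch of opting into the new behaviour from calling
code. The BFSScraperStrategy parameters come from this patch; the no-argument
FilterChain() and URLScorer() constructions are assumptions for illustration,
since their real signatures are not shown here:

    from crawl4ai.scraper.bfs_scraper_strategy import BFSScraperStrategy
    from crawl4ai.scraper.filters import FilterChain
    from crawl4ai.scraper.scorers import URLScorer

    # Hypothetical construction: the FilterChain/URLScorer argument lists are
    # assumptions; only BFSScraperStrategy's parameters come from this patch.
    strategy = BFSScraperStrategy(
        max_depth=2,
        filter_chain=FilterChain(),
        url_scorer=URLScorer(),
        process_external_links=True,  # new parameter: also follow off-site links
    )

Leaving process_external_links at its default of False preserves the previous
behaviour, where only links under result.links["internal"] are queued.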