From f8e85b149939eea296805a680c080bbac884f269 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Sat, 23 Nov 2024 13:52:34 +0530
Subject: [PATCH] Fixed a bug in _process_links, handled condition for when
 url_scorer is passed as None, renamed the scrapper folder to scraper.

---
 crawl4ai/scraper/bfs_scraper_strategy.py      | 34 +++++++++++--------
 .../async_web_scraper.md                      |  0
 .../bfs_scraper_strategy.md                   |  0
 .../{scrapper => scraper}/filters_scrorers.md |  0
 docs/{scrapper => scraper}/how_to_use.md      |  0
 .../scraper_quickstart.py                     | 32 ++++++++----------
 6 files changed, 35 insertions(+), 31 deletions(-)
 rename docs/{scrapper => scraper}/async_web_scraper.md (100%)
 rename docs/{scrapper => scraper}/bfs_scraper_strategy.md (100%)
 rename docs/{scrapper => scraper}/filters_scrorers.md (100%)
 rename docs/{scrapper => scraper}/how_to_use.md (100%)
 rename docs/{scrapper => scraper}/scraper_quickstart.py (88%)

diff --git a/crawl4ai/scraper/bfs_scraper_strategy.py b/crawl4ai/scraper/bfs_scraper_strategy.py
index cae7ba90..72a86203 100644
--- a/crawl4ai/scraper/bfs_scraper_strategy.py
+++ b/crawl4ai/scraper/bfs_scraper_strategy.py
@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
         links_ro_process = result.links["internal"]
         if self.process_external_links:
             links_ro_process += result.links["external"]
-        for link_type in links_ro_process:
-            for link in result.links[link_type]:
-                url = link['href']
-                # url = urljoin(source_url, link['href'])
-                # url = urlunparse(urlparse(url)._replace(fragment=""))
-
-                if url not in visited and await self.can_process_url(url):
-                    new_depth = depths[source_url] + 1
-                    if new_depth <= self.max_depth:
+        for link in links_ro_process:
+            url = link['href']
+            # url = urljoin(source_url, link['href'])
+            # url = urlunparse(urlparse(url)._replace(fragment=""))
+
+            if url not in visited and await self.can_process_url(url):
+                new_depth = depths[source_url] + 1
+                if new_depth <= self.max_depth:
+                    if self.url_scorer:
                         score = self.url_scorer.score(url)
-                        await queue.put((score, new_depth, url))
-                        depths[url] = new_depth
-                        self.stats.total_depth_reached = max(
-                            self.stats.total_depth_reached,
-                            new_depth
-                        )
+                    else:
+                        # When no url_scorer is provided all urls will have the same score of 0.
+                        # Therefore they will be processed in FIFO order as per URL depth
+                        score = 0
+                    await queue.put((score, new_depth, url))
+                    depths[url] = new_depth
+                    self.stats.total_depth_reached = max(
+                        self.stats.total_depth_reached,
+                        new_depth
+                    )
 
     async def ascrape(
         self,
diff --git a/docs/scrapper/async_web_scraper.md b/docs/scraper/async_web_scraper.md
similarity index 100%
rename from docs/scrapper/async_web_scraper.md
rename to docs/scraper/async_web_scraper.md
diff --git a/docs/scrapper/bfs_scraper_strategy.md b/docs/scraper/bfs_scraper_strategy.md
similarity index 100%
rename from docs/scrapper/bfs_scraper_strategy.md
rename to docs/scraper/bfs_scraper_strategy.md
diff --git a/docs/scrapper/filters_scrorers.md b/docs/scraper/filters_scrorers.md
similarity index 100%
rename from docs/scrapper/filters_scrorers.md
rename to docs/scraper/filters_scrorers.md
diff --git a/docs/scrapper/how_to_use.md b/docs/scraper/how_to_use.md
similarity index 100%
rename from docs/scrapper/how_to_use.md
rename to docs/scraper/how_to_use.md
diff --git a/docs/scrapper/scraper_quickstart.py b/docs/scraper/scraper_quickstart.py
similarity index 88%
rename from docs/scrapper/scraper_quickstart.py
rename to docs/scraper/scraper_quickstart.py
index a2c7a239..811f997e 100644
--- a/docs/scrapper/scraper_quickstart.py
+++ b/docs/scraper/scraper_quickstart.py
@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
     ContentTypeFilter
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
+import re
 
 async def basic_scraper_example():
     """
@@ -18,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        URLPatternFilter("*/blog/*"),
+        # URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
-
-    # Start scraping
-    try:
-        result = await scraper.ascrape("https://example.com/blog/")
-
-        # Process results
-        print(f"Crawled {len(result.crawled_urls)} pages:")
-        for url, data in result.extracted_data.items():
-            print(f"- {url}: {len(data.html)} bytes")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+        # Start scraping
+        try:
+            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
 
-    except Exception as e:
-        print(f"Error during scraping: {e}")
+            # Process results
+            print(f"Crawled {len(result.crawled_urls)} pages:")
+            for url, data in result.extracted_data.items():
+                print(f"- {url}: {len(data.html)} bytes")
+
+        except Exception as e:
+            print(f"Error during scraping: {e}")
 
 # advanced_scraper_example.py
 import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
     print("Running basic scraper example...")
     asyncio.run(basic_scraper_example())
 
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
\ No newline at end of file
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())
\ No newline at end of file
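
Note on the url_scorer change above: queue items are (score, new_depth, url) tuples, so when no scorer is supplied every link gets score 0 and the priority queue falls back to comparing depth (ties broken by URL string), i.e. shallower pages are dequeued first. Below is a minimal sketch of that ordering; it assumes the strategy's queue is an asyncio.PriorityQueue (the queue type and variable name are not shown in this hunk), and only the tuple layout is taken from the patch.

import asyncio

async def demo() -> None:
    # Hypothetical stand-in for the strategy's internal queue; only the
    # (score, depth, url) tuple layout comes from the patch above.
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()

    # With url_scorer=None every link is enqueued with score 0, so tuple
    # comparison falls through to the second element: the crawl depth.
    await queue.put((0, 2, "https://example.com/a/b"))
    await queue.put((0, 1, "https://example.com/a"))
    await queue.put((0, 3, "https://example.com/a/b/c"))

    while not queue.empty():
        score, depth, url = await queue.get()
        print(depth, url)  # dequeues depth 1, then 2, then 3

asyncio.run(demo())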