Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

This commit is contained in:
Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions

View File

@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
links_ro_process = result.links["internal"] links_ro_process = result.links["internal"]
if self.process_external_links: if self.process_external_links:
links_ro_process += result.links["external"] links_ro_process += result.links["external"]
for link_type in links_ro_process: for link in links_ro_process:
for link in result.links[link_type]: url = link['href']
url = link['href'] # url = urljoin(source_url, link['href'])
# url = urljoin(source_url, link['href']) # url = urlunparse(urlparse(url)._replace(fragment=""))
# url = urlunparse(urlparse(url)._replace(fragment=""))
if url not in visited and await self.can_process_url(url): if url not in visited and await self.can_process_url(url):
new_depth = depths[source_url] + 1 new_depth = depths[source_url] + 1
if new_depth <= self.max_depth: if new_depth <= self.max_depth:
if self.url_scorer:
score = self.url_scorer.score(url) score = self.url_scorer.score(url)
await queue.put((score, new_depth, url)) else:
depths[url] = new_depth # When no url_scorer is provided all urls will have same score of 0.
self.stats.total_depth_reached = max( # Therefore will be process in FIFO order as per URL depth
self.stats.total_depth_reached, score = 0
new_depth await queue.put((score, new_depth, url))
) depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached,
new_depth
)
async def ascrape( async def ascrape(
self, self,

View File

@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
ContentTypeFilter ContentTypeFilter
) )
from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
async def basic_scraper_example(): async def basic_scraper_example():
""" """
@@ -18,7 +19,7 @@ async def basic_scraper_example():
# Create a simple filter chain # Create a simple filter chain
filter_chain = FilterChain([ filter_chain = FilterChain([
# Only crawl pages within the blog section # Only crawl pages within the blog section
URLPatternFilter("*/blog/*"), # URLPatternFilter("*/tutorial/*"),
# Only process HTML pages # Only process HTML pages
ContentTypeFilter(["text/html"]) ContentTypeFilter(["text/html"])
]) ])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
) )
# Create the crawler and scraper # Create the crawler and scraper
crawler = AsyncWebCrawler() async with AsyncWebCrawler(verbose=True) as crawler:
scraper = AsyncWebScraper(crawler, strategy) scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
# Start scraping # Process results
try: print(f"Crawled {len(result.crawled_urls)} pages:")
result = await scraper.ascrape("https://example.com/blog/") for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
# Process results except Exception as e:
print(f"Crawled {len(result.crawled_urls)} pages:") print(f"Error during scraping: {e}")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py # advanced_scraper_example.py
import logging import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
print("Running basic scraper example...") print("Running basic scraper example...")
asyncio.run(basic_scraper_example()) asyncio.run(basic_scraper_example())
print("\nRunning advanced scraper example...") # print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example()) # asyncio.run(advanced_scraper_example())