Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.

This commit is contained in:
Aravind Karnam
2024-11-23 13:52:34 +05:30
parent c1797037c0
commit f8e85b1499
6 changed files with 35 additions and 31 deletions

View File

@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
links_ro_process = result.links["internal"]
if self.process_external_links:
links_ro_process += result.links["external"]
for link_type in links_ro_process:
for link in result.links[link_type]:
url = link['href']
# url = urljoin(source_url, link['href'])
# url = urlunparse(urlparse(url)._replace(fragment=""))
if url not in visited and await self.can_process_url(url):
new_depth = depths[source_url] + 1
if new_depth <= self.max_depth:
for link in links_ro_process:
url = link['href']
# url = urljoin(source_url, link['href'])
# url = urlunparse(urlparse(url)._replace(fragment=""))
if url not in visited and await self.can_process_url(url):
new_depth = depths[source_url] + 1
if new_depth <= self.max_depth:
if self.url_scorer:
score = self.url_scorer.score(url)
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached,
new_depth
)
else:
# When no url_scorer is provided, all URLs get the same score of 0.
# Therefore they will be processed in FIFO order by URL depth.
score = 0
await queue.put((score, new_depth, url))
depths[url] = new_depth
self.stats.total_depth_reached = max(
self.stats.total_depth_reached,
new_depth
)
async def ascrape(
self,

View File

@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
ContentTypeFilter
)
from crawl4ai.async_webcrawler import AsyncWebCrawler
import re
async def basic_scraper_example():
"""
@@ -18,7 +19,7 @@ async def basic_scraper_example():
# Create a simple filter chain
filter_chain = FilterChain([
# Only crawl pages within the blog section
URLPatternFilter("*/blog/*"),
# URLPatternFilter("*/tutorial/*"),
# Only process HTML pages
ContentTypeFilter(["text/html"])
])
@@ -32,20 +33,19 @@ async def basic_scraper_example():
)
# Create the crawler and scraper
crawler = AsyncWebCrawler()
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://example.com/blog/")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
async with AsyncWebCrawler(verbose=True) as crawler:
scraper = AsyncWebScraper(crawler, strategy)
# Start scraping
try:
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
except Exception as e:
print(f"Error during scraping: {e}")
# Process results
print(f"Crawled {len(result.crawled_urls)} pages:")
for url, data in result.extracted_data.items():
print(f"- {url}: {len(data.html)} bytes")
except Exception as e:
print(f"Error during scraping: {e}")
# advanced_scraper_example.py
import logging
@@ -180,5 +180,5 @@ if __name__ == "__main__":
print("Running basic scraper example...")
asyncio.run(basic_scraper_example())
print("\nRunning advanced scraper example...")
asyncio.run(advanced_scraper_example())
# print("\nRunning advanced scraper example...")
# asyncio.run(advanced_scraper_example())