Fixed a bug in _process_links, handled the case where url_scorer is None, and renamed the scrapper folder to scraper.
This commit is contained in:
@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
|
||||
links_ro_process = result.links["internal"]
|
||||
if self.process_external_links:
|
||||
links_ro_process += result.links["external"]
|
||||
for link_type in links_ro_process:
|
||||
for link in result.links[link_type]:
|
||||
url = link['href']
|
||||
# url = urljoin(source_url, link['href'])
|
||||
# url = urlunparse(urlparse(url)._replace(fragment=""))
|
||||
|
||||
if url not in visited and await self.can_process_url(url):
|
||||
new_depth = depths[source_url] + 1
|
||||
if new_depth <= self.max_depth:
|
||||
for link in links_ro_process:
|
||||
url = link['href']
|
||||
# url = urljoin(source_url, link['href'])
|
||||
# url = urlunparse(urlparse(url)._replace(fragment=""))
|
||||
|
||||
if url not in visited and await self.can_process_url(url):
|
||||
new_depth = depths[source_url] + 1
|
||||
if new_depth <= self.max_depth:
|
||||
if self.url_scorer:
|
||||
score = self.url_scorer.score(url)
|
||||
await queue.put((score, new_depth, url))
|
||||
depths[url] = new_depth
|
||||
self.stats.total_depth_reached = max(
|
||||
self.stats.total_depth_reached,
|
||||
new_depth
|
||||
)
|
||||
else:
|
||||
# When no url_scorer is provided, all URLs will have the same score of 0.
|
||||
# Therefore they will be processed in FIFO order as per URL depth
|
||||
score = 0
|
||||
await queue.put((score, new_depth, url))
|
||||
depths[url] = new_depth
|
||||
self.stats.total_depth_reached = max(
|
||||
self.stats.total_depth_reached,
|
||||
new_depth
|
||||
)
|
||||
|
||||
async def ascrape(
|
||||
self,
|
||||
|
||||
@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
|
||||
ContentTypeFilter
|
||||
)
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
import re
|
||||
|
||||
async def basic_scraper_example():
|
||||
"""
|
||||
@@ -18,7 +19,7 @@ async def basic_scraper_example():
|
||||
# Create a simple filter chain
|
||||
filter_chain = FilterChain([
|
||||
# Only crawl pages within the blog section
|
||||
URLPatternFilter("*/blog/*"),
|
||||
# URLPatternFilter("*/tutorial/*"),
|
||||
# Only process HTML pages
|
||||
ContentTypeFilter(["text/html"])
|
||||
])
|
||||
@@ -32,20 +33,19 @@ async def basic_scraper_example():
|
||||
)
|
||||
|
||||
# Create the crawler and scraper
|
||||
crawler = AsyncWebCrawler()
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
|
||||
# Start scraping
|
||||
try:
|
||||
result = await scraper.ascrape("https://example.com/blog/")
|
||||
|
||||
# Process results
|
||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
||||
for url, data in result.extracted_data.items():
|
||||
print(f"- {url}: {len(data.html)} bytes")
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
scraper = AsyncWebScraper(crawler, strategy)
|
||||
# Start scraping
|
||||
try:
|
||||
result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during scraping: {e}")
|
||||
# Process results
|
||||
print(f"Crawled {len(result.crawled_urls)} pages:")
|
||||
for url, data in result.extracted_data.items():
|
||||
print(f"- {url}: {len(data.html)} bytes")
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error during scraping: {e}")
|
||||
|
||||
# advanced_scraper_example.py
|
||||
import logging
|
||||
@@ -180,5 +180,5 @@ if __name__ == "__main__":
|
||||
print("Running basic scraper example...")
|
||||
asyncio.run(basic_scraper_example())
|
||||
|
||||
print("\nRunning advanced scraper example...")
|
||||
asyncio.run(advanced_scraper_example())
|
||||
# print("\nRunning advanced scraper example...")
|
||||
# asyncio.run(advanced_scraper_example())
|
||||
Reference in New Issue
Block a user