Fixed a bug in _process_links, handled the case where url_scorer is passed as None, and renamed the scrapper folder to scraper.
@@ -192,22 +192,26 @@ class BFSScraperStrategy(ScraperStrategy):
         links_ro_process = result.links["internal"]
         if self.process_external_links:
             links_ro_process += result.links["external"]
-        for link_type in links_ro_process:
-            for link in result.links[link_type]:
-                url = link['href']
-                # url = urljoin(source_url, link['href'])
-                # url = urlunparse(urlparse(url)._replace(fragment=""))
-
-                if url not in visited and await self.can_process_url(url):
-                    new_depth = depths[source_url] + 1
-                    if new_depth <= self.max_depth:
-                        score = self.url_scorer.score(url)
-                        await queue.put((score, new_depth, url))
-                        depths[url] = new_depth
-                        self.stats.total_depth_reached = max(
-                            self.stats.total_depth_reached,
-                            new_depth
-                        )
+        for link in links_ro_process:
+            url = link['href']
+            # url = urljoin(source_url, link['href'])
+            # url = urlunparse(urlparse(url)._replace(fragment=""))
+
+            if url not in visited and await self.can_process_url(url):
+                new_depth = depths[source_url] + 1
+                if new_depth <= self.max_depth:
+                    if self.url_scorer:
+                        score = self.url_scorer.score(url)
+                    else:
+                        # When no url_scorer is provided all urls will have same score of 0.
+                        # Therefore will be process in FIFO order as per URL depth
+                        score = 0
+                    await queue.put((score, new_depth, url))
+                    depths[url] = new_depth
+                    self.stats.total_depth_reached = max(
+                        self.stats.total_depth_reached,
+                        new_depth
+                    )
 
     async def ascrape(
         self,
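The behavioral part of this hunk is the fallback to score = 0 when no url_scorer is configured. A minimal sketch of what that ordering means, assuming the strategy's queue is an asyncio.PriorityQueue of (score, new_depth, url) tuples (the queue's construction is outside this hunk, so this is an assumption, not the project's code):

# Sketch only: illustrates ordering when every URL gets score 0.
import asyncio

async def demo():
    queue = asyncio.PriorityQueue()
    # With no url_scorer every entry has score 0, so tuples compare on depth next.
    await queue.put((0, 2, "https://example.com/deep"))
    await queue.put((0, 1, "https://example.com/shallow"))
    await queue.put((0, 1, "https://example.com/also-shallow"))
    while not queue.empty():
        score, depth, url = await queue.get()
        print(score, depth, url)  # depth-1 URLs come out before the depth-2 URL

asyncio.run(demo())

Under that assumption, a uniform score keeps processing ordered by depth, as the added comment says; ties at the same depth fall back to comparing the URL strings, so equal-depth pages come out in lexicographic rather than strict insertion order.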
@@ -7,6 +7,7 @@ from crawl4ai.scraper import (
     ContentTypeFilter
 )
 from crawl4ai.async_webcrawler import AsyncWebCrawler
+import re
 
 async def basic_scraper_example():
     """
@@ -18,7 +19,7 @@ async def basic_scraper_example():
     # Create a simple filter chain
     filter_chain = FilterChain([
         # Only crawl pages within the blog section
-        URLPatternFilter("*/blog/*"),
+        # URLPatternFilter("*/tutorial/*"),
         # Only process HTML pages
         ContentTypeFilter(["text/html"])
     ])
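With the blog-only URLPatternFilter commented out (and the */tutorial/* pattern left commented as well), the only active filter in the chain is ContentTypeFilter(["text/html"]), so the example no longer restricts crawling to a particular URL path.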
@@ -32,20 +33,19 @@ async def basic_scraper_example():
     )
 
     # Create the crawler and scraper
-    crawler = AsyncWebCrawler()
-    scraper = AsyncWebScraper(crawler, strategy)
-
-    # Start scraping
-    try:
-        result = await scraper.ascrape("https://example.com/blog/")
-
-        # Process results
-        print(f"Crawled {len(result.crawled_urls)} pages:")
-        for url, data in result.extracted_data.items():
-            print(f"- {url}: {len(data.html)} bytes")
-
-    except Exception as e:
-        print(f"Error during scraping: {e}")
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        scraper = AsyncWebScraper(crawler, strategy)
+
+        # Start scraping
+        try:
+            result = await scraper.ascrape("https://crawl4ai.com/mkdocs")
+
+            # Process results
+            print(f"Crawled {len(result.crawled_urls)} pages:")
+            for url, data in result.extracted_data.items():
+                print(f"- {url}: {len(data.html)} bytes")
+
+        except Exception as e:
+            print(f"Error during scraping: {e}")
 
 
 # advanced_scraper_example.py
 import logging
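After this hunk the scraper, the ascrape call, and the result handling all run inside the crawler's async context manager, so the crawler's setup and teardown wrap the whole scrape even when an exception is raised; the example's target also moves from the placeholder https://example.com/blog/ to https://crawl4ai.com/mkdocs.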
@@ -180,5 +180,5 @@ if __name__ == "__main__":
     print("Running basic scraper example...")
     asyncio.run(basic_scraper_example())
 
-    print("\nRunning advanced scraper example...")
-    asyncio.run(advanced_scraper_example())
+    # print("\nRunning advanced scraper example...")
+    # asyncio.run(advanced_scraper_example())