2025 feb alpha 1 (#685)

* spelling change in prompt * gpt-4o-mini support * Remove leading Y before here * prompt spell correction * (Docs) Fix numbered list end-of-line formatting Added the missing "two spaces" to add a line break * fix: access downloads_path through browser_config in _handle_download method - Fixes #585 * crawl * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/583 * Docs update: https://github.com/unclecode/crawl4ai/issues/649 * fix: https://github.com/unclecode/crawl4ai/issues/570 * Docs: updated example for content-selection to reflect new changes in yc newsfeed css * Refactor: Removed old filters and replaced with optimised filters * fix:Fixed imports as per the new names of filters * Tests: For deep crawl filters * Refactor: Remove old scorers and replace with optimised ones: Fix imports forall filters and scorers. * fix: awaiting on filters that are async in nature eg: content relevance and seo filters * fix: https://github.com/unclecode/crawl4ai/issues/592 * fix: https://github.com/unclecode/crawl4ai/issues/715 --------- Co-authored-by: DarshanTank <darshan.tank@gnani.ai> Co-authored-by: Tuhin Mallick <tuhin.mllk@gmail.com> Co-authored-by: Serhat Soydan <ssoydan@gmail.com> Co-authored-by: cardit1 <maneesh@cardit.in> Co-authored-by: Tautik Agrahari <tautikagrahari@gmail.com>
2025-02-19 11:43:17 +05:30
parent c171891999
commit dad592c801
19 changed files with 833 additions and 1350 deletions
--- a/tests/20241401/test_advanced_deep_crawl.py
+++ b/tests/20241401/test_advanced_deep_crawl.py
@@ -0,0 +1,46 @@
+import asyncio
+import time
+
+
+from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
+from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
+from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, DomainFilter, ContentTypeFilter, ContentRelevanceFilter
+from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
+# from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, BestFirstCrawlingStrategy
+
+
+async def main():
+    """Example deep crawl of documentation site."""
+    filter_chain = FilterChain([
+        URLPatternFilter(patterns=["*2025*"]),
+        DomainFilter(allowed_domains=["techcrunch.com"]),
+        ContentRelevanceFilter(query="Use of artificial intelligence in Defence applications", threshold=1),
+        ContentTypeFilter(allowed_types=["text/html","application/javascript"])
+    ])
+    config = CrawlerRunConfig(
+        deep_crawl_strategy = BestFirstCrawlingStrategy(
+            max_depth=2,
+            include_external=False,
+            filter_chain=filter_chain,
+            url_scorer=KeywordRelevanceScorer(keywords=["anduril", "defence", "AI"]),
+        ),
+        stream=False,
+        verbose=True,
+        cache_mode=CacheMode.BYPASS,
+        scraping_strategy=LXMLWebScrapingStrategy()
+    )
+
+    async with AsyncWebCrawler() as crawler:
+        print("Starting deep crawl in streaming mode:")
+        config.stream = True
+        start_time = time.perf_counter()
+        async for result in await crawler.arun(
+            url="https://techcrunch.com",
+            config=config
+        ):
+            print(f"→ {result.url} (Depth: {result.metadata.get('depth', 0)})")
+        print(f"Duration: {time.perf_counter() - start_time:.2f} seconds")
+
+if __name__ == "__main__":
+    asyncio.run(main())