Refactored web scraping components

- Enhanced the web scraping strategy with new methods for optimized media handling.
- Added new utility functions for better content processing.
- Refined existing features for improved accuracy and efficiency in scraping tasks.
- Introduced more robust filtering criteria for media elements.
2024-12-05 22:33:47 +08:00
parent 486db3a771
commit 8c611dcb4b
4 changed files with 408 additions and 430 deletions
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -547,6 +547,7 @@ async def generate_knowledge_graph():
            f.write(result.extracted_content)

 async def fit_markdown_remove_overlay():
+    
    async with AsyncWebCrawler(
            headless=True,  # Set to False to see what is happening
            verbose=True,
@@ -560,13 +561,15 @@ async def fit_markdown_remove_overlay():
            url='https://www.kidocode.com/degrees/technology',
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0),
+                content_filter=PruningContentFilter(
+                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
+                ),
                options={
                    "ignore_links": True
                }
            ),
            # markdown_generator=DefaultMarkdownGenerator(
-            #     content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0),
+            #     content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
            #     options={
            #         "ignore_links": True
            #     }