refactor(crawler): update hello_world example with content filtering

- Update hello_world example with proper content filtering
2025-01-01 19:39:42 +08:00
parent a96e05d4ae
commit aa4f92f458
1 changed file with 25 additions and 0 deletions
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -0,0 +1,25 @@
+import os, sys
+
+sys.path.append(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+from crawl4ai import *
+
+async def main():
+    async with AsyncWebCrawler() as crawler:
+        crawler_config = CrawlerRunConfig(
+            cache_mode=CacheMode.BYPASS,
+            markdown_generator=DefaultMarkdownGenerator(
+                content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
+            )
+        )
+        result = await crawler.arun(
+            url="https://www.nbcnews.com/business",
+            config=crawler_config
+        )
+        print(result.markdown_v2.raw_markdown[:500])
+
+if __name__ == "__main__":
+    asyncio.run(main())