- Test all methods

- Update index.html - Update Readme - Resolve some bugs
2024-05-14 21:27:41 +08:00
parent 5fea6c064b
commit f6e59157bf
17 changed files with 1004 additions and 402 deletions
--- a/docs/quickstart.py
+++ b/docs/quickstart.py
@@ -0,0 +1,33 @@
+import os
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+
+
+def main():  # Quickstart demo: crawl one page with explicit options and print the result
+    crawler = WebCrawler()
+    crawler.warmup()  # NOTE(review): presumably prepares the crawler before the first run — confirm
+    
+    # Single page crawl
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+        word_count_threshold=5,  # Minimum word count for an HTML tag to be considered a worthy block
+        chunking_strategy=RegexChunking(patterns=["\n\n"]),  # Default is RegexChunking
+        extraction_strategy=CosineStrategy(
+            word_count_threshold=20, max_dist=0.2, linkage_method="ward", top_k=3
+        ),  # Default is CosineStrategy
+        # extraction_strategy= LLMExtractionStrategy(provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY')),
+        bypass_cache=True,
+        extract_blocks=True,  # Whether to extract semantic blocks of text from the HTML
+        css_selector="",  # E.g. "div.article-body", or "h2" to select all H2 tags
+        verbose=True,
+        include_raw_html=True,  # Whether to include the raw HTML content in the response
+    )
+    
+
+    print("[LOG] 📦 Crawl result:")
+    print(result.model_dump())  # NOTE(review): model_dump() suggests a pydantic model — confirm
+
+
+if __name__ == "__main__":  # Run the demo only when executed as a script, not on import
+    main()