Update:

- Text Categorization
- Crawler, Extraction, and Chunking strategies
- Clustering for semantic segmentation
2024-05-12 22:37:21 +08:00
parent 7039e3c1ee
commit 82706129f5
19 changed files with 84568 additions and 102 deletions
--- a/examples/quickstart.py
+++ b/examples/quickstart.py
@@ -8,11 +8,12 @@ def main():
    crawler = WebCrawler(db_path='crawler_data.db')

    # Fetch a single page
-    single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
+    single_url = UrlModel(url='https://www.nbcnews.com/business', forced=True)
    result = crawler.fetch_page(
        single_url, 
        provider= "openai/gpt-3.5-turbo", 
        api_token = os.getenv('OPENAI_API_KEY'), 
+        use_cached_html = True,
        extract_blocks_flag=True,
        word_count_threshold=10
    )