- Text Categorization
- Crawler, Extraction, and Chunking strategies
- Clustering for semantic segmentation
This commit is contained in:
unclecode
2024-05-12 22:37:21 +08:00
parent 7039e3c1ee
commit 82706129f5
19 changed files with 84568 additions and 102 deletions

View File

@@ -8,11 +8,12 @@ def main():
crawler = WebCrawler(db_path='crawler_data.db')
# Fetch a single page
-single_url = UrlModel(url='https://www.nbcnews.com/business', forced=False)
+single_url = UrlModel(url='https://www.nbcnews.com/business', forced=True)
result = crawler.fetch_page(
single_url,
provider= "openai/gpt-3.5-turbo",
api_token = os.getenv('OPENAI_API_KEY'),
use_cached_html = True,
extract_blocks_flag=True,
word_count_threshold=10
)