chore: Update web crawler URLs to use NBC News business section

This commit is contained in:
unclecode
2024-05-17 18:11:13 +08:00
parent 36e46be23d
commit 957a2458b1
3 changed files with 17 additions and 17 deletions

View File

@@ -31,7 +31,7 @@ from crawl4ai import WebCrawler
crawler = WebCrawler() crawler = WebCrawler()
# Run the crawler on a single URL with default settings # Run the crawler on a single URL with default settings
result = crawler.run(url="https://www.example.com") result = crawler.run(url="https://www.nbcnews.com/business")
print(result) # {url, html, markdown, extracted_content, metadata} print(result) # {url, html, markdown, extracted_content, metadata}
``` ```
@@ -63,7 +63,7 @@ crawler = WebCrawler(crawler_strategy=crawler_strategy)
# Run the crawler with keyword filtering and CSS selector # Run the crawler with keyword filtering and CSS selector
result = crawler.run( result = crawler.run(
url="https://www.example.com", url="https://www.nbcnews.com/business",
extraction_strategy=CosineStrategy( extraction_strategy=CosineStrategy(
semantic_filter="technology", semantic_filter="technology",
), ),
@@ -71,7 +71,7 @@ result = crawler.run(
# Run the crawler with LLM extraction strategy # Run the crawler with LLM extraction strategy
result = crawler.run( result = crawler.run(
url="https://www.example.com", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'), api_token=os.getenv('OPENAI_API_KEY'),
@@ -93,16 +93,16 @@ With Crawl4AI, you can perform advanced web crawling and data extraction tasks w
## Table of Contents ## Table of Contents
1. [Features](#features) 1. [Features](#features-)
2. [Installation](#installation) 2. [Installation](#installation-)
3. [REST API/Local Server](#using-the-local-server-ot-rest-api) 3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
4. [Python Library Usage](#usage) 4. [Python Library Usage](#python-library-usage-)
5. [Parameters](#parameters) 5. [Parameters](#parameters-)
6. [Chunking Strategies](#chunking-strategies) 6. [Chunking Strategies](#chunking-strategies-)
7. [Extraction Strategies](#extraction-strategies) 7. [Extraction Strategies](#extraction-strategies-)
8. [Contributing](#contributing) 8. [Contributing](#contributing-)
9. [License](#license) 9. [License](#license-)
10. [Contact](#contact) 10. [Contact](#contact-)
## Features ✨ ## Features ✨
@@ -168,7 +168,7 @@ To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with th
**Example Request:** **Example Request:**
```json ```json
{ {
"urls": ["https://www.example.com"], "urls": ["https://www.nbcnews.com/business"],
"include_raw_html": false, "include_raw_html": false,
"bypass_cache": true, "bypass_cache": true,
"word_count_threshold": 5, "word_count_threshold": 5,
@@ -195,7 +195,7 @@ To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with th
"status": "success", "status": "success",
"data": [ "data": [
{ {
"url": "https://www.example.com", "url": "https://www.nbcnews.com/business",
"extracted_content": "...", "extracted_content": "...",
"html": "...", "html": "...",
"markdown": "...", "markdown": "...",

View File

@@ -128,7 +128,7 @@ def add_llm_extraction_strategy(crawler):
print_result(result) print_result(result)
result = crawler.run( result = crawler.run(
url="https://www.example.com", url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o", provider="openai/gpt-4o",
api_token=os.getenv('OPENAI_API_KEY'), api_token=os.getenv('OPENAI_API_KEY'),

View File

@@ -12,7 +12,7 @@ requirements_without_torch_transformers_nlkt = [req for req in requirements if n
setup( setup(
name="Crawl4AI", name="Crawl4AI",
version="0.1.2", version="0.2.0",
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper", description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
long_description=open("README.md").read(), long_description=open("README.md").read(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",