chore: Update web crawler URLs to use NBC News business section
This commit is contained in:
30
README.md
30
README.md
@@ -31,7 +31,7 @@ from crawl4ai import WebCrawler
|
|||||||
crawler = WebCrawler()
|
crawler = WebCrawler()
|
||||||
|
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(url="https://www.example.com")
|
result = crawler.run(url="https://www.nbcnews.com/business")
|
||||||
print(result) # {url, html, markdown, extracted_content, metadata}
|
print(result) # {url, html, markdown, extracted_content, metadata}
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -63,7 +63,7 @@ crawler = WebCrawler(crawler_strategy=crawler_strategy)
|
|||||||
|
|
||||||
# Run the crawler with keyword filtering and CSS selector
|
# Run the crawler with keyword filtering and CSS selector
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.example.com",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=CosineStrategy(
|
extraction_strategy=CosineStrategy(
|
||||||
semantic_filter="technology",
|
semantic_filter="technology",
|
||||||
),
|
),
|
||||||
@@ -71,7 +71,7 @@ result = crawler.run(
|
|||||||
|
|
||||||
# Run the crawler with LLM extraction strategy
|
# Run the crawler with LLM extraction strategy
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.example.com",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o",
|
provider="openai/gpt-4o",
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
@@ -93,16 +93,16 @@ With Crawl4AI, you can perform advanced web crawling and data extraction tasks w
|
|||||||
|
|
||||||
## Table of Contents
|
## Table of Contents
|
||||||
|
|
||||||
1. [Features](#features)
|
1. [Features](#features-)
|
||||||
2. [Installation](#installation)
|
2. [Installation](#installation-)
|
||||||
3. [REST API/Local Server](#using-the-local-server-ot-rest-api)
|
3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
|
||||||
4. [Python Library Usage](#usage)
|
4. [Python Library Usage](#python-library-usage-)
|
||||||
5. [Parameters](#parameters)
|
5. [Parameters](#parameters-)
|
||||||
6. [Chunking Strategies](#chunking-strategies)
|
6. [Chunking Strategies](#chunking-strategies-)
|
||||||
7. [Extraction Strategies](#extraction-strategies)
|
7. [Extraction Strategies](#extraction-strategies-)
|
||||||
8. [Contributing](#contributing)
|
8. [Contributing](#contributing-)
|
||||||
9. [License](#license)
|
9. [License](#license-)
|
||||||
10. [Contact](#contact)
|
10. [Contact](#contact-)
|
||||||
|
|
||||||
|
|
||||||
## Features ✨
|
## Features ✨
|
||||||
@@ -168,7 +168,7 @@ To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with th
|
|||||||
**Example Request:**
|
**Example Request:**
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"urls": ["https://www.example.com"],
|
"urls": ["https://www.nbcnews.com/business"],
|
||||||
"include_raw_html": false,
|
"include_raw_html": false,
|
||||||
"bypass_cache": true,
|
"bypass_cache": true,
|
||||||
"word_count_threshold": 5,
|
"word_count_threshold": 5,
|
||||||
@@ -195,7 +195,7 @@ To use the REST API, send a POST request to `https://crawl4ai.com/crawl` with th
|
|||||||
"status": "success",
|
"status": "success",
|
||||||
"data": [
|
"data": [
|
||||||
{
|
{
|
||||||
"url": "https://www.example.com",
|
"url": "https://www.nbcnews.com/business",
|
||||||
"extracted_content": "...",
|
"extracted_content": "...",
|
||||||
"html": "...",
|
"html": "...",
|
||||||
"markdown": "...",
|
"markdown": "...",
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ def add_llm_extraction_strategy(crawler):
|
|||||||
print_result(result)
|
print_result(result)
|
||||||
|
|
||||||
result = crawler.run(
|
result = crawler.run(
|
||||||
url="https://www.example.com",
|
url="https://www.nbcnews.com/business",
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o",
|
provider="openai/gpt-4o",
|
||||||
api_token=os.getenv('OPENAI_API_KEY'),
|
api_token=os.getenv('OPENAI_API_KEY'),
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -12,7 +12,7 @@ requirements_without_torch_transformers_nlkt = [req for req in requirements if n
|
|||||||
|
|
||||||
setup(
|
setup(
|
||||||
name="Crawl4AI",
|
name="Crawl4AI",
|
||||||
version="0.1.2",
|
version="0.2.0",
|
||||||
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
|
||||||
long_description=open("README.md").read(),
|
long_description=open("README.md").read(),
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
Reference in New Issue
Block a user