Merge branch 'main' of https://github.com/unclecode/crawl4ai
This commit is contained in:
33
README.md
33
README.md
@@ -169,30 +169,15 @@ With Crawl4AI, you can perform advanced web crawling and data extraction tasks w
|
|||||||
|
|
||||||
## Table of Contents
|
## Table of Contents
|
||||||
|
|
||||||
1. [Features](#features-)
|
1. [Installation](#installation-)
|
||||||
2. [Installation](#installation-)
|
2. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
|
||||||
3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
|
3. [Python Library Usage](#python-library-usage-)
|
||||||
4. [Python Library Usage](#python-library-usage-)
|
4. [Parameters](#parameters-)
|
||||||
5. [Parameters](#parameters-)
|
5. [Chunking Strategies](#chunking-strategies-)
|
||||||
6. [Chunking Strategies](#chunking-strategies-)
|
6. [Extraction Strategies](#extraction-strategies-)
|
||||||
7. [Extraction Strategies](#extraction-strategies-)
|
7. [Contributing](#contributing-)
|
||||||
8. [Contributing](#contributing-)
|
8. [License](#license-)
|
||||||
9. [License](#license-)
|
9. [Contact](#contact-)
|
||||||
10. [Contact](#contact-)
|
|
||||||
|
|
||||||
|
|
||||||
## Features ✨
|
|
||||||
|
|
||||||
- 🕷️ Efficient web crawling to extract valuable data from websites
|
|
||||||
- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
|
|
||||||
- 🌍 Supports crawling multiple URLs simultaneously
|
|
||||||
- 🌃 Replace media tags with ALT.
|
|
||||||
- 🆓 Completely free to use and open-source
|
|
||||||
- 📜 Execute custom JavaScript before crawling
|
|
||||||
- 📚 Chunking strategies: topic-based, regex, sentence, and more
|
|
||||||
- 🧠 Extraction strategies: cosine clustering, LLM, and more
|
|
||||||
- 🎯 CSS selector support
|
|
||||||
- 📝 Pass instructions/keywords to refine extraction
|
|
||||||
|
|
||||||
## Installation 💻
|
## Installation 💻
|
||||||
|
|
||||||
|
|||||||
@@ -77,6 +77,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
|
self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
|
||||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
||||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
||||||
|
self.apply_chunking = kwargs.get("apply_chunking", True)
|
||||||
|
if not self.apply_chunking:
|
||||||
|
self.chunk_token_threshold = 1e9
|
||||||
|
|
||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = kwargs.get("verbose", False)
|
||||||
|
|
||||||
|
|||||||
46
docs/examples/summarize_page.py
Normal file
46
docs/examples/summarize_page.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
import os
|
||||||
|
import time
|
||||||
|
import json
|
||||||
|
from crawl4ai.web_crawler import WebCrawler
|
||||||
|
from crawl4ai.chunking_strategy import *
|
||||||
|
from crawl4ai.extraction_strategy import *
|
||||||
|
from crawl4ai.crawler_strategy import *
|
||||||
|
|
||||||
|
# Example script: crawl one page and ask an LLM to summarise it into the
# PageSummary schema, then print the result and save it as JSON.
url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'

crawler = WebCrawler()
crawler.warmup()

from pydantic import BaseModel, Field


class PageSummary(BaseModel):
    """Schema handed to the LLM describing the fields to extract."""

    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    keywords: list = Field(..., description="Keywords assigned to the page.")


result = crawler.run(
    url=url,
    word_count_threshold=1,
    extraction_strategy=LLMExtractionStrategy(
        provider="openai/gpt-4o",
        # The API key is read from the environment; os.getenv returns None
        # when OPENAI_API_KEY is not set.
        api_token=os.getenv('OPENAI_API_KEY'),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        # NOTE(review): presumably sends the page to the LLM without
        # splitting it into chunks -- confirm against LLMExtractionStrategy.
        apply_chunking=False,
        # Adjacent literals are concatenated into a single instruction string.
        instruction=(
            "From the crawled content, extract the following details: "
            "1. Title of the page "
            "2. Summary of the page, which is a detailed summary "
            "3. Brief summary of the page, which is a paragraph text "
            "4. Keywords assigned to the page, which is a list of keywords. "
            'The extracted JSON format should look like this: '
            '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
        ),
    ),
    # NOTE(review): presumably forces a fresh crawl rather than a cached
    # result -- confirm against WebCrawler.run.
    bypass_cache=True,
)

# extracted_content is parsed as JSON text here; the raw text is what gets
# written to disk below.
page_summary = json.loads(result.extracted_content)

print(page_summary)

output_path = ".data/page_summary.json"
# Create the output directory first: open() does not create missing parents.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# Write as UTF-8 explicitly so non-ASCII text in the summary does not depend
# on the platform's default encoding.
with open(output_path, "w", encoding="utf-8") as f:
    f.write(result.extracted_content)
|
||||||
Reference in New Issue
Block a user