From 1fcb573909364afdf751f5caaa85bc1fbc0bde87 Mon Sep 17 00:00:00 2001
From: unclecode
Date: Wed, 19 Jun 2024 18:53:22 +0800
Subject: [PATCH 1/2] chore: Update table of contents in README.md

---
 README.md | 33 +++++++++------------------------
 1 file changed, 9 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index 1e9ccb87..0e2f4fd5 100644
--- a/README.md
+++ b/README.md
@@ -169,30 +169,15 @@ With Crawl4AI, you can perform advanced web crawling and data extraction tasks w
 
 ## Table of Contents
 
-1. [Features](#features-)
-2. [Installation](#installation-)
-3. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
-4. [Python Library Usage](#python-library-usage-)
-5. [Parameters](#parameters-)
-6. [Chunking Strategies](#chunking-strategies-)
-7. [Extraction Strategies](#extraction-strategies-)
-8. [Contributing](#contributing-)
-9. [License](#license-)
-10. [Contact](#contact-)
-
-
-## Features ✨
-
-- 🕷️ Efficient web crawling to extract valuable data from websites
-- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown)
-- 🌍 Supports crawling multiple URLs simultaneously
-- 🌃 Replace media tags with ALT.
-- 🆓 Completely free to use and open-source
-- 📜 Execute custom JavaScript before crawling
-- 📚 Chunking strategies: topic-based, regex, sentence, and more
-- 🧠 Extraction strategies: cosine clustering, LLM, and more
-- 🎯 CSS selector support
-- 📝 Pass instructions/keywords to refine extraction
+1. [Installation](#installation-)
+2. [REST API/Local Server](#using-the-local-server-ot-rest-api-)
+3. [Python Library Usage](#python-library-usage-)
+4. [Parameters](#parameters-)
+5. [Chunking Strategies](#chunking-strategies-)
+6. [Extraction Strategies](#extraction-strategies-)
+7. [Contributing](#contributing-)
+8. [License](#license-)
+9. [Contact](#contact-)
 
 ## Installation 💻
 

From 21b110bfd72682c229ce4814a13ad076b7229602 Mon Sep 17 00:00:00 2001
From: unclecode
Date: Wed, 19 Jun 2024 19:03:35 +0800
Subject: [PATCH 2/2] Update LLMExtractionStrategy to disable chunking if
 specified, Add example of summarization for a web page.

---
 crawl4ai/extraction_strategy.py |  3 +++
 docs/examples/summarize_page.py | 46 +++++++++++++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 docs/examples/summarize_page.py

diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index dca61350..aadcda20 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -77,6 +77,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
         self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
         self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
         self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
+        self.apply_chunking = kwargs.get("apply_chunking", True)
+        if not self.apply_chunking:
+            self.chunk_token_threshold = 1e9
         self.verbose = kwargs.get("verbose", False)
 
 
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
new file mode 100644
index 00000000..31098e8e
--- /dev/null
+++ b/docs/examples/summarize_page.py
@@ -0,0 +1,46 @@
+import os
+import time
+import json
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+
+url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+
+crawler = WebCrawler()
+crawler.warmup()
+
+from pydantic import BaseModel, Field
+
+class PageSummary(BaseModel):
+    title: str = Field(..., description="Title of the page.")
+    summary: str = Field(..., description="Summary of the page.")
+    brief_summary: str = Field(..., description="Brief summary of the page.")
+    keywords: list = Field(..., description="Keywords assigned to the page.")
+
+result = crawler.run(
+    url=url,
+    word_count_threshold=1,
+    extraction_strategy= LLMExtractionStrategy(
+        provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
+        schema=PageSummary.model_json_schema(),
+        extraction_type="schema",
+        apply_chunking =False,
+        instruction="From the crawled content, extract the following details: "\
+        "1. Title of the page "\
+        "2. Summary of the page, which is a detailed summary "\
+        "3. Brief summary of the page, which is a paragraph text "\
+        "4. Keywords assigned to the page, which is a list of keywords. "\
+        'The extracted JSON format should look like this: '\
+        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+    ),
+    bypass_cache=True,
+)
+
+page_summary = json.loads(result.extracted_content)
+
+print(page_summary)
+
+with open(".data/page_summary.json", "w") as f:
+    f.write(result.extracted_content)