diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index dca61350..aadcda20 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -77,6 +77,9 @@ class LLMExtractionStrategy(ExtractionStrategy):
         self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
         self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
         self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
+        self.apply_chunking = kwargs.get("apply_chunking", True)
+        if not self.apply_chunking:
+            self.chunk_token_threshold = 1e9
 
         self.verbose = kwargs.get("verbose", False)
 
diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py
new file mode 100644
index 00000000..31098e8e
--- /dev/null
+++ b/docs/examples/summarize_page.py
@@ -0,0 +1,61 @@
+"""Summarize a web page into structured JSON with crawl4ai.
+
+Crawls one URL, asks the LLM (token read from OPENAI_API_KEY) to fill in the
+PageSummary schema with apply_chunking=False, prints the parsed result, and
+writes the raw extracted JSON to .data/page_summary.json.
+"""
+import os
+import time
+import json
+from crawl4ai.web_crawler import WebCrawler
+from crawl4ai.chunking_strategy import *
+from crawl4ai.extraction_strategy import *
+from crawl4ai.crawler_strategy import *
+from pydantic import BaseModel, Field
+
+url = r'https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot'
+output_path = ".data/page_summary.json"
+
+crawler = WebCrawler()
+crawler.warmup()
+
+
+class PageSummary(BaseModel):
+    """Fields the LLM is asked to extract from the crawled page."""
+
+    title: str = Field(..., description="Title of the page.")
+    summary: str = Field(..., description="Summary of the page.")
+    brief_summary: str = Field(..., description="Brief summary of the page.")
+    keywords: list = Field(..., description="Keywords assigned to the page.")
+
+
+result = crawler.run(
+    url=url,
+    word_count_threshold=1,
+    extraction_strategy=LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token=os.getenv('OPENAI_API_KEY'),
+        schema=PageSummary.model_json_schema(),
+        extraction_type="schema",
+        apply_chunking=False,
+        instruction=(
+            "From the crawled content, extract the following details: "
+            "1. Title of the page "
+            "2. Summary of the page, which is a detailed summary "
+            "3. Brief summary of the page, which is a paragraph text "
+            "4. Keywords assigned to the page, which is a list of keywords. "
+            'The extracted JSON format should look like this: '
+            '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
+        ),
+    ),
+    bypass_cache=True,
+)
+
+page_summary = json.loads(result.extracted_content)
+
+print(page_summary)
+
+# open() does not create missing parent directories, so make ".data" first.
+os.makedirs(os.path.dirname(output_path), exist_ok=True)
+with open(output_path, "w", encoding="utf-8") as f:
+    f.write(result.extracted_content)