diff --git a/CHANGELOG.md b/CHANGELOG.md index df96beab..70d2bb40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. - Added an example in `quickstart.py` in the example folder under the docs. - Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM. - +- Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +- Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). ## [0.2.4] - 2024-06-17 ### Fixed diff --git a/README.md b/README.md index 23f8bdcb..987ec232 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,25 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information - ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. - 🐳 Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). -### v0.2.4 +Check the [Changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for more details. + + +## Features ✨ +- πŸ†“ Completely free to use and open-source (If one can assume this as a feature ;)) +- πŸ€– LLM-friendly output formats (JSON, cleaned HTML, markdown) +- 🌍 Supports crawling multiple URLs simultaneously +- 🎨 Extract and return all media tags (Images, Audio, and Video). +- πŸ”— Extract all external and internal links. +- πŸ“š Extract metadata from the page. +- πŸ”„ Custom hooks for authentication, headers, and page modifications before crawling +- πŸ•΅οΈ Support `user_agent` parameter to set the user agent for the HTTP requests. +- πŸ–ΌοΈ Take [screenshots](#taking-screenshots) of the page. 
+- πŸ“œ Execute multiple custom JavaScripts before crawling +- πŸ“š Chunking strategies: topic-based, regex, sentence, and more +- 🧠 Extraction strategies: cosine clustering, LLM, and more +- 🎯 CSS selector support +- πŸ“ Pass instructions/keywords to refine extraction + - 🐞 Resolve the issue with the long url. (Issue #22) ### v0.2.3 @@ -83,6 +101,53 @@ result = crawler.run(url="https://www.nbcnews.com/business") print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots} ``` +### Extract with LLM +Next example is crawling all OpenAI models with their fees from the official page. ['OpenAI Models and Pricing'](https://openai.com/api/pricing/) + +```python +import os +import time +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import * +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * + +url = r'https://openai.com/api/pricing/' + +crawler = WebCrawler() +crawler.warmup() + +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +result = crawler.run( + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="From the crawled content, extract all mentioned model names along with their "\ + "fees for input and output tokens. Make sure not to miss anything in the entire content. 
"\ 'One extracted model JSON format should look like this: '\ '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' + ), + bypass_cache=True, +) + +model_fees = json.loads(result.extracted_content) + +print(len(model_fees)) + +with open(".data/data.json", "w") as f: + f.write(result.extracted_content) +``` + +## Execute JS, Filter Data with CSS Selector, and Clustering using Cosine Strategy Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific contentβ€”all in one go! 1. Instantiate a WebCrawler object. @@ -109,23 +174,12 @@ crawler.warmup() result = crawler.run( url="https://www.nbcnews.com/business", js = js_code, + css_selector="p", extraction_strategy=CosineStrategy( semantic_filter="technology", ), ) -# Run the crawler with LLM extraction strategy -result = crawler.run( - url="https://www.nbcnews.com/business", - js = js_code, - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv('OPENAI_API_KEY'), - instruction="Extract only content related to technology" - ), - css_selector="p" -) - # Display the extracted result print(result) ``` diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index d93a3487..dca61350 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -71,6 +71,8 @@ class LLMExtractionStrategy(ExtractionStrategy): self.instruction = instruction self.extract_type = extraction_type self.schema = schema + if schema: + self.extract_type = "schema" self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)