diff --git a/CHANGELOG.md b/CHANGELOG.md index eb854b1d..70d2bb40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ - before_return_html: Called when the data is parsed and ready. - on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. - Added an example in `quickstart.py` in the example folder under the docs. +- Enhancement issue #24: Replaced inline HTML tags (e.g., DEL, INS, SUB, ABBR) with textual format for better context handling in LLM. +- Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +- Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). ## [0.2.4] - 2024-06-17 ### Fixed diff --git a/Dockerfile b/Dockerfile index 264d4159..54cf746e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,5 @@ - # First stage: Build and install dependencies -FROM python:3.10-slim-bookworm as builder +FROM python:3.10-slim-bookworm # Set the working directory in the container WORKDIR /usr/src/app @@ -9,51 +8,30 @@ WORKDIR /usr/src/app RUN apt-get update && \ apt-get install -y --no-install-recommends \ wget \ + git \ curl \ - unzip + unzip \ + gnupg \ + xvfb \ + ca-certificates \ + apt-transport-https \ + software-properties-common && \ + rm -rf /var/lib/apt/lists/* # Install Python dependencies COPY requirements.txt . RUN pip install --no-cache-dir -r requirements.txt && \ - pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \ + pip install --no-cache-dir spacy torch onnxruntime uvicorn && \ python -m spacy download en_core_web_sm + # pip install --no-cache-dir spacy torch torchvision torchaudio onnxruntime uvicorn && \ -# Download and install ChromeDriver -RUN CHROMEDRIVER_VERSION=$(curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE) && \ - wget -N https://chromedriver.storage.googleapis.com/$CHROMEDRIVER_VERSION/chromedriver_linux64.zip -P /tmp && \ - unzip /tmp/chromedriver_linux64.zip -d /tmp && \ - mv /tmp/chromedriver /usr/local/bin/chromedriver && \ - chmod +x /usr/local/bin/chromedriver && \ - rm /tmp/chromedriver_linux64.zip - -# Second stage: Create final runtime image -FROM python:3.10-slim-bookworm - -# Set the working directory in the container -WORKDIR /usr/src/app - -# Install runtime dependencies -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - wget \ - git \ - xvfb \ - gnupg2 \ - ca-certificates \ - apt-transport-https \ - software-properties-common && \ - wget -q -O - https://dl.google.com/linux/linux_signing_key.pub | apt-key add - && \ - echo "deb http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list && \ +# Install Google Chrome and ChromeDriver +RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ + sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google-chrome.list' && \ apt-get update && \ - apt-get install -y --no-install-recommends google-chrome-stable && \ - rm -rf /var/lib/apt/lists/* /etc/apt/sources.list.d/google-chrome.list - -# Copy Chromedriver from the builder stage -COPY --from=builder /usr/local/bin/chromedriver /usr/local/bin/chromedriver - -# Copy installed Python packages from builder stage -COPY --from=builder /usr/local/lib/python3.10/site-packages /usr/local/lib/python3.10/site-packages -COPY --from=builder /usr/local/bin /usr/local/bin + apt-get install -y google-chrome-stable && \ + wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip && \ + unzip /tmp/chromedriver.zip chromedriver -d /usr/local/bin/ # Copy the rest of the application code COPY . . @@ -65,12 +43,19 @@ ENV CHROME_BIN=/usr/bin/google-chrome \ DBUS_SESSION_BUS_ADDRESS=/dev/null \ PYTHONUNBUFFERED=1 +# pip install -e .[all] +RUN pip install --no-cache-dir -e .[all] + # Ensure the PATH environment variable includes the location of the installed packages -ENV PATH /usr/local/bin:$PATH +ENV PATH /opt/conda/bin:$PATH # Make port 80 available to the world outside this container EXPOSE 80 +# Download models call cli "crawl4ai-download-models" +RUN crawl4ai-download-models +# RUN python crawl4ai/model_loader.py + # Run uvicorn CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] diff --git a/Dockerfile-version-0 b/Dockerfile-version-0 deleted file mode 100644 index 4c86b882..00000000 --- a/Dockerfile-version-0 +++ /dev/null @@ -1,45 +0,0 @@ -# Use an official Python runtime as a parent image -FROM python:3.10-slim -# In case you had some weird issues, try this Image -# FROM python:3.10-slim-bookworm as builder - -# Set the working directory in the container -WORKDIR /usr/src/app - -# Copy the current directory contents into the container at /usr/src/app -COPY . . - -# Install dependencies for Chrome and ChromeDriver -RUN apt-get update && apt-get install -y --no-install-recommends \ - wget \ - xvfb \ - unzip \ - curl \ - gnupg2 \ - ca-certificates \ - apt-transport-https \ - software-properties-common \ - && mkdir -p /etc/apt/keyrings \ - && curl -fsSL https://dl-ssl.google.com/linux/linux_signing_key.pub | gpg --dearmor -o /etc/apt/keyrings/google-linux-signing-keyring.gpg \ - && echo 'deb [arch=amd64 signed-by=/etc/apt/keyrings/google-linux-signing-keyring.gpg] http://dl.google.com/linux/chrome/deb/ stable main' | tee /etc/apt/sources.list.d/google-chrome.list \ - && apt-get update \ - && apt-get install -y google-chrome-stable \ - && rm -rf /var/lib/apt/lists/* \ - && apt-get install -y chromium-chromedriver - -# Install Python dependencies -RUN pip install --no-cache-dir -r requirements.txt -RUN pip install spacy torch torchvision torchaudio - -# Set display port and dbus env to avoid hanging -ENV DISPLAY=:99 -ENV DBUS_SESSION_BUS_ADDRESS=/dev/null - -# Make port 80 available to the world outside this container -EXPOSE 80 - -# Define environment variable -ENV PYTHONUNBUFFERED 1 - -# Run uvicorn -CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "80", "--workers", "4"] diff --git a/README.md b/README.md index b0b12510..1e9ccb87 100644 --- a/README.md +++ b/README.md @@ -21,33 +21,29 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information - 🟠 before_return_html: Called when the data is parsed and ready. - 🟑 on_user_agent_updated: Called when the user changes the user_agent, causing the driver to reinitialize. - πŸ“„ Added an example in [`quickstart.py`](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) in the example folder under the docs. +- ✨ Maintaining the semantic context of inline tags (e.g., abbreviation, DEL, INS) for improved LLM-friendliness. +- 🐳 Updated Dockerfile to ensure compatibility across multiple platforms (Hopefully!). -### v0.2.4 -- 🐞 Resolve the issue with the long url. (Issue #22) +Check the [Changelog](https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md) for more details. -### v0.2.3 -- 🎨 Extract and return all media tags (Images, Audio, and Video). Check `result.media` -- πŸ”— Extrat all external and internal links. Check `result.links` -- πŸ“š Extract metadata from the page. Check `result.metadata` + +## Features ✨ +- πŸ†“ Completely free to use and open-source (If one can assume this as a feature ;)) +- πŸ€– LLM-friendly output formats (JSON, cleaned HTML, markdown) +- 🌍 Supports crawling multiple URLs simultaneously +- 🎨 Extract and return all media tags (Images, Audio, and Video). +- πŸ”— Extrat all external and internal links. +- πŸ“š Extract metadata from the page. +- πŸ”„ Custom hooks for authentication, headers, and page modifications before crawling - πŸ•΅οΈ Support `user_agent` parameter to set the user agent for the HTTP requests. - πŸ–ΌοΈ Take [screenshots](#taking-screenshots) of the page. - -### v0.2.2 -- Support multiple JS scripts -- Fixed some of bugs -- Resolved a few issue relevant to Colab installation - -### v0.2.0 -- πŸš€ 10x faster!! -- πŸ“œ Execute custom JavaScript before crawling! -- 🀝 Colab friendly! -- πŸ“š Chunking strategies: topic-based, regex, sentence, and more! -- 🧠 Extraction strategies: cosine clustering, LLM, and more! +- πŸ“œ Execute multiple custom JavaScripts before crawling +- πŸ“š Chunking strategies: topic-based, regex, sentence, and more +- 🧠 Extraction strategies: cosine clustering, LLM, and more - 🎯 CSS selector support - πŸ“ Pass instructions/keywords to refine extraction ## Power and Simplicity of Crawl4AI πŸš€ - The most easy way! If you don't want to install any library, you can use the REST API on my server. But remember, this is just a simple server. I may improve its capacity if I see there is demand. You can find ll examples of REST API in this colab notebook. [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zODYjhemJ5bUmYceWpVoBMVpd0ofzNBZ?usp=sharing) ```python @@ -81,6 +77,53 @@ result = crawler.run(url="https://www.nbcnews.com/business") print(result) # {url, html, cleaned_html, markdown, media, links, extracted_content, metadata, screenshots} ``` +### Extract with LLM +Next example is crawling all OpenAI models withh their fees from the official page. ['OpenAI Models and Pricing'](https://openai.com/api/pricing/) + +```python +import os +import time +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import * +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * + +url = r'https://openai.com/api/pricing/' + +crawler = WebCrawler() +crawler.warmup() + +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +result = crawler.run( + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="From the crawled content, extract all mentioned model names along with their "\ + "fees for input and output tokens. Make sure not to miss anything in the entire content. "\ + 'One extracted model JSON format should look like this: '\ + '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' + ), + bypass_cache=True, +) + +model_fees = json.loads(result.extracted_content) + +print(len(model_fees)) + +with open(".data/data.json", "w") as f: + f.write(result.extracted_content) +``` + +## Execute JS, Filter Data with CSS Selector, and Clustring using Cosine Strategy Now let's try a complex task. Below is an example of how you can execute JavaScript, filter data using keywords, and use a CSS selector to extract specific contentβ€”all in one go! 1. Instantiate a WebCrawler object. @@ -107,23 +150,12 @@ crawler.warmup() result = crawler.run( url="https://www.nbcnews.com/business", js = js_code, + css_selector="p" extraction_strategy=CosineStrategy( semantic_filter="technology", ), ) -# Run the crawler with LLM extraction strategy -result = crawler.run( - url="https://www.nbcnews.com/business", - js = js_code, - extraction_strategy=LLMExtractionStrategy( - provider="openai/gpt-4o", - api_token=os.getenv('OPENAI_API_KEY'), - instruction="Extract only content related to technology" - ), - css_selector="p" -) - # Display the extracted result print(result) ``` diff --git a/crawl4ai/config.py b/crawl4ai/config.py index a20eb547..77273b78 100644 --- a/crawl4ai/config.py +++ b/crawl4ai/config.py @@ -21,7 +21,9 @@ PROVIDER_MODELS = { # Chunk token threshold -CHUNK_TOKEN_THRESHOLD = 1000 +CHUNK_TOKEN_THRESHOLD = 500 +OVERLAP_RATE = 0.1 +WORD_TOKEN_RATE = 1.3 # Threshold for the minimum number of word in a HTML tag to be considered -MIN_WORD_THRESHOLD = 5 +MIN_WORD_THRESHOLD = 1 diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index ecf0863a..9e85d60d 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -79,8 +79,15 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): self.options.headless = True if kwargs.get("user_agent"): self.options.add_argument("--user-agent=" + kwargs.get("user_agent")) + else: + # Set user agent + user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + self.options.add_argument(f"--user-agent={user_agent}") + self.options.add_argument("--no-sandbox") - self.options.add_argument("--headless") + self.options.headless = kwargs.get("headless", True) + if self.options.headless: + self.options.add_argument("--headless") # self.options.add_argument("--disable-dev-shm-usage") self.options.add_argument("--disable-gpu") # self.options.add_argument("--disable-extensions") @@ -112,10 +119,19 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): # chromedriver_autoinstaller.install() import chromedriver_autoinstaller - self.service = Service(chromedriver_autoinstaller.install()) + crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") + chromedriver_path = chromedriver_autoinstaller.utils.download_chromedriver(crawl4ai_folder, False) + # self.service = Service(chromedriver_autoinstaller.install()) + self.service = Service(chromedriver_path) self.service.log_path = "NUL" self.driver = webdriver.Chrome(service=self.service, options=self.options) self.driver = self.execute_hook('on_driver_created', self.driver) + + if kwargs.get("cookies"): + for cookie in kwargs.get("cookies"): + self.driver.add_cookie(cookie) + + def set_hook(self, hook_type: str, hook: Callable): if hook_type in self.hooks: diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index a24b5fe5..dca61350 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -3,12 +3,12 @@ from typing import Any, List, Dict, Optional, Union from concurrent.futures import ThreadPoolExecutor, as_completed import json, time # from optimum.intel import IPEXModel -from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION +from .prompts import * from .config import * from .utils import * from functools import partial from .model_loader import * - +import math import numpy as np class ExtractionStrategy(ABC): @@ -55,7 +55,9 @@ class NoExtractionStrategy(ExtractionStrategy): return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] class LLMExtractionStrategy(ExtractionStrategy): - def __init__(self, provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, instruction:str = None, **kwargs): + def __init__(self, + provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, + instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): """ Initialize the strategy with clustering parameters. @@ -67,6 +69,15 @@ class LLMExtractionStrategy(ExtractionStrategy): self.provider = provider self.api_token = api_token or PROVIDER_MODELS.get(provider, None) or os.getenv("OPENAI_API_KEY") self.instruction = instruction + self.extract_type = extraction_type + self.schema = schema + if schema: + self.extract_type = "schema" + + self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) + self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) + self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) + self.verbose = kwargs.get("verbose", False) if not self.api_token: @@ -81,10 +92,15 @@ class LLMExtractionStrategy(ExtractionStrategy): "HTML": escape_json_string(sanitize_html(html)), } + prompt_with_variables = PROMPT_EXTRACT_BLOCKS if self.instruction: variable_values["REQUEST"] = self.instruction + prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION + + if self.extract_type == "schema": + variable_values["SCHEMA"] = json.dumps(self.schema) + prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION - prompt_with_variables = PROMPT_EXTRACT_BLOCKS if not self.instruction else PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION for variable in variable_values: prompt_with_variables = prompt_with_variables.replace( "{" + variable + "}", variable_values[variable] @@ -112,32 +128,62 @@ class LLMExtractionStrategy(ExtractionStrategy): print("[LOG] Extracted", len(blocks), "blocks from URL:", url, "block index:", ix) return blocks - def _merge(self, documents): + def _merge(self, documents, chunk_token_threshold, overlap): chunks = [] sections = [] + total_tokens = 0 + + # Calculate the total tokens across all documents + for document in documents: + total_tokens += len(document.split(' ')) * self.word_token_rate + + # Calculate the number of sections needed + num_sections = math.floor(total_tokens / chunk_token_threshold) + if num_sections < 1: + num_sections = 1 # Ensure there is at least one section + adjusted_chunk_threshold = total_tokens / num_sections + total_token_so_far = 0 + current_chunk = [] for document in documents: - if total_token_so_far < CHUNK_TOKEN_THRESHOLD: - chunk = document.split(' ') - total_token_so_far += len(chunk) * 1.3 - chunks.append(document) - else: - sections.append('\n\n'.join(chunks)) - chunks = [document] - total_token_so_far = len(document.split(' ')) * 1.3 - - if chunks: - sections.append('\n\n'.join(chunks)) + tokens = document.split(' ') + token_count = len(tokens) * self.word_token_rate - return sections + if total_token_so_far + token_count <= adjusted_chunk_threshold: + current_chunk.extend(tokens) + total_token_so_far += token_count + else: + # Ensure to handle the last section properly + if len(sections) == num_sections - 1: + current_chunk.extend(tokens) + continue + + # Add overlap if specified + if overlap > 0 and current_chunk: + overlap_tokens = current_chunk[-overlap:] + current_chunk.extend(overlap_tokens) + + sections.append(' '.join(current_chunk)) + current_chunk = tokens + total_token_so_far = token_count + + # Add the last chunk + if current_chunk: + sections.append(' '.join(current_chunk)) + + return sections + def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]: """ Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionStrategy. """ - merged_sections = self._merge(sections) + merged_sections = self._merge( + sections, self.chunk_token_threshold, + overlap= int(self.chunk_token_threshold * self.overlap_rate) + ) extracted_content = [] if self.provider.startswith("groq/"): # Sequential processing with a delay diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index e0498ccc..39de7e3b 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -164,4 +164,35 @@ Please provide your output within tags, like this: **Make sure to follow the user instruction to extract blocks aligin with the instruction.** -Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" \ No newline at end of file +Remember, the output should be a complete, parsable JSON wrapped in tags, with no omissions or errors. The JSON objects should semantically break down the content into relevant blocks, maintaining the original order.""" + +PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION = """Here is the content from the URL: +{URL} + + +{HTML} + + +The user has made the following request for what information to extract from the above content: + + +{REQUEST} + + + +{SCHEMA} + + +Please carefully read the URL content and the user's request. If the user provided a desired JSON schema in the above, extract the requested information from the URL content according to that schema. If no schema was provided, infer an appropriate JSON schema based on the user's request that will best capture the key information they are looking for. + +Extraction instructions: +Return the extracted information as a list of JSON objects, with each object in the list corresponding to a block of content from the URL, in the same order as it appears on the page. Wrap the entire JSON list in tags. + +Quality Reflection: +Before outputting your final answer, double check that the JSON you are returning is complete, containing all the information requested by the user, and is valid JSON that could be parsed by json.loads() with no errors or omissions. The outputted JSON objects should fully match the schema, either provided or inferred. + +Quality Score: +After reflecting, score the quality and completeness of the JSON data you are about to return on a scale of 1 to 5. Write the score inside tags. + +Result +Output the final list of JSON objects, wrapped in tags.""" \ No newline at end of file diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index f201ba0b..9892134f 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -151,7 +151,42 @@ class CustomHTML2Text(HTML2Text): super().handle_tag(tag, attrs, start) -def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None): +def replace_inline_tags(soup, tags, only_text=False): + tag_replacements = { + 'b': lambda tag: f"**{tag.text}**", + 'i': lambda tag: f"*{tag.text}*", + 'u': lambda tag: f"__{tag.text}__", + 'span': lambda tag: f"{tag.text}", + 'del': lambda tag: f"~~{tag.text}~~", + 'ins': lambda tag: f"++{tag.text}++", + 'sub': lambda tag: f"~{tag.text}~", + 'sup': lambda tag: f"^^{tag.text}^^", + 'strong': lambda tag: f"**{tag.text}**", + 'em': lambda tag: f"*{tag.text}*", + 'code': lambda tag: f"`{tag.text}`", + 'kbd': lambda tag: f"`{tag.text}`", + 'var': lambda tag: f"_{tag.text}_", + 's': lambda tag: f"~~{tag.text}~~", + 'q': lambda tag: f'"{tag.text}"', + 'abbr': lambda tag: f"{tag.text} ({tag.get('title', '')})", + 'cite': lambda tag: f"_{tag.text}_", + 'dfn': lambda tag: f"_{tag.text}_", + 'time': lambda tag: f"{tag.text}", + 'small': lambda tag: f"{tag.text}", + 'mark': lambda tag: f"=={tag.text}==" + } + + for tag_name in tags: + for tag in soup.find_all(tag_name): + if not only_text: + replacement_text = tag_replacements.get(tag_name, lambda t: t.text)(tag) + tag.replace_with(replacement_text) + else: + tag.replace_with(tag.text) + + return soup + +def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs): try: if not html: return None @@ -249,6 +284,13 @@ def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, # Replace all "pre" tags with their inner text body = replace_pre_tags_with_text(body) + + # Replace inline tags with their text content + body = replace_inline_tags( + body, + ['b', 'i', 'u', 'span', 'del', 'ins', 'sub', 'sup', 'strong', 'em', 'code', 'kbd', 'var', 's', 'q', 'abbr', 'cite', 'dfn', 'time', 'small', 'mark'], + only_text=kwargs.get('only_text', False) + ) # Recursively remove empty elements, their parent elements, and elements with word count below threshold def remove_empty_and_low_word_count_elements(node, word_count_threshold): diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index da44cc19..5dd4b9c0 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -42,7 +42,7 @@ class WebCrawler: def warmup(self): print("[LOG] 🌀️ Warming up the WebCrawler") result = self.run( - url='https://crawl4ai.uccode.io/', + url='https://google.com/', word_count_threshold=5, extraction_strategy= NoExtractionStrategy(), bypass_cache=False, @@ -176,7 +176,7 @@ class WebCrawler: t = time.time() # Extract content from HTML try: - result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector) + result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) metadata = extract_metadata(html) if result is None: raise ValueError(f"Failed to extract content from the website: {url}") diff --git a/docs/examples/llm_extraction_openai_pricing.py b/docs/examples/llm_extraction_openai_pricing.py new file mode 100644 index 00000000..c4c6943e --- /dev/null +++ b/docs/examples/llm_extraction_openai_pricing.py @@ -0,0 +1,40 @@ +import os +import time +from crawl4ai.web_crawler import WebCrawler +from crawl4ai.chunking_strategy import * +from crawl4ai.extraction_strategy import * +from crawl4ai.crawler_strategy import * + +url = r'https://openai.com/api/pricing/' + +crawler = WebCrawler() +crawler.warmup() + +from pydantic import BaseModel, Field + +class OpenAIModelFee(BaseModel): + model_name: str = Field(..., description="Name of the OpenAI model.") + input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") + output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") + +result = crawler.run( + url=url, + word_count_threshold=1, + extraction_strategy= LLMExtractionStrategy( + provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), + schema=OpenAIModelFee.model_json_schema(), + extraction_type="schema", + instruction="From the crawled content, extract all mentioned model names along with their "\ + "fees for input and output tokens. Make sure not to miss anything in the entire content. "\ + 'One extracted model JSON format should look like this: '\ + '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' + ), + bypass_cache=True, +) + +model_fees = json.loads(result.extracted_content) + +print(len(model_fees)) + +with open(".data/data.json", "w") as f: + f.write(result.extracted_content) \ No newline at end of file diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index a6139f0a..24486cc1 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -35,7 +35,13 @@ def cprint(message, press_any_key=False): def basic_usage(crawler): cprint("πŸ› οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]") - result = crawler.run(url="https://www.nbcnews.com/business") + result = crawler.run(url="https://www.nbcnews.com/business", only_text = True) + cprint("[LOG] πŸ“¦ [bold yellow]Basic crawl result:[/bold yellow]") + print_result(result) + +def basic_usage_some_params(crawler): + cprint("πŸ› οΈ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]") + result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text = True) cprint("[LOG] πŸ“¦ [bold yellow]Basic crawl result:[/bold yellow]") print_result(result) @@ -203,6 +209,9 @@ def using_crawler_hooks(crawler): driver.get('https://example.com/login') from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.common.by import By + from selenium.webdriver.support import expected_conditions as EC + WebDriverWait(driver, 10).until( EC.presence_of_element_located((By.NAME, 'username')) ) @@ -257,7 +266,9 @@ def main(): crawler = create_crawler() + crawler.always_by_pass_cache = True basic_usage(crawler) + # basic_usage_some_params(crawler) understanding_parameters(crawler) crawler.always_by_pass_cache = True diff --git a/requirements.txt b/requirements.txt index 20f7a0e4..ee5be60a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,21 +1,22 @@ -aiohttp -aiosqlite -bs4 -fastapi -html2text -httpx -litellm -nltk -pydantic -python-dotenv -requests -rich -scikit-learn -selenium -uvicorn -transformers -chromedriver-autoinstaller -torch -onnxruntime -tokenizers -pillow \ No newline at end of file +numpy==1.25.0 +aiohttp==3.9.5 +aiosqlite==0.20.0 +beautifulsoup4==4.12.3 +fastapi==0.111.0 +html2text==2024.2.26 +httpx==0.27.0 +litellm==1.40.17 +nltk==3.8.1 +pydantic==2.7.4 +python-dotenv==1.0.1 +requests==2.32.3 +rich==13.7.1 +scikit-learn==1.5.0 +selenium==4.21.0 +uvicorn==0.30.1 +transformers==4.41.2 +chromedriver-autoinstaller==0.6.4 +torch==2.3.1 +onnxruntime==1.18.0 +tokenizers==0.19.1 +pillow==10.3.0 diff --git a/setup.py b/setup.py index 2d05e206..a368e95d 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,15 @@ from setuptools import setup, find_packages import os +import sys +from pathlib import Path import subprocess from setuptools.command.install import install +# Create the .crawl4ai folder in the user's home directory if it doesn't exist +crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai") +os.makedirs(crawl4ai_folder, exist_ok=True) +os.makedirs(f"{crawl4ai_folder}/cache", exist_ok=True) + # Read the requirements from requirements.txt with open("requirements.txt") as f: requirements = f.read().splitlines()