Compare commits

...

7 Commits

Author SHA1 Message Date
UncleCode
494ee32619 Update README.md 2024-12-30 21:24:30 +08:00
UncleCode
e4e23065f1 Update README.md (#389) 2024-12-30 21:24:06 +08:00
Robin Singh
78768fd714 Update simple-crawling.md (#379)
In the comprehensive example,

AttributeError: type object 'CacheMode' has no attribute 'ENABLE'. Did you mean: 'ENABLED'?
2024-12-27 17:42:59 +08:00
Haopeng138
bacbeb3ed4 Fix #340 example llm_extraction (#358)
@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved to the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to the contributors list as well.
2024-12-24 19:56:07 +08:00
UncleCode
ed7bc1909c Bump version to 0.4.22 2024-12-15 19:49:38 +08:00
UncleCode
e9e5b5642d Fix js_snipprt issue 0.4.21
bump to 0.4.22
2024-12-15 19:49:30 +08:00
UncleCode
7524aa7b5e Feature: Add Markdown generation to CrawlerRunConfig
- Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`.
  - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`.
  - Updated version number to 0.4.21 in `__version__.py`.
2024-12-13 21:51:38 +08:00
11 changed files with 87 additions and 65 deletions

View File

@@ -1 +1,2 @@
include requirements.txt include requirements.txt
recursive-include crawl4ai/js_snippet *.js

View File

@@ -1,4 +1,4 @@
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI. # 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a> <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.4.2" __version__ = "0.4.22"

View File

@@ -7,6 +7,7 @@ from .config import (
from .user_agent_generator import UserAgentGenerator from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig: class BrowserConfig:
""" """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD , word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None, content_filter=None,
cache_mode=None, cache_mode=None,
session_id: str = None, session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter self.content_filter = content_filter
self.cache_mode = cache_mode self.cache_mode = cache_mode
self.session_id = session_id self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200), word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"), extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"), chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"), content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"), cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"), session_id=kwargs.get("session_id"),

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Optional, List, Union from typing import Optional, List, Union
import json import json
import asyncio import asyncio
from contextlib import nullcontext, asynccontextmanager # from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager from .async_database import async_db_manager
from .chunking_strategy import * from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
from .extraction_strategy import * from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -133,16 +135,11 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb): async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb) await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self): async def awarmup(self):
"""Initialize the crawler with warm-up sequence.""" """Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True self.ready = True
@asynccontextmanager @asynccontextmanager
async def nullcontext(self): async def nullcontext(self):
"""异步空上下文管理器""" """异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data, screenshot=screenshot_data,
pdf_data=pdf_data, pdf_data=pdf_data,
verbose=config.verbose verbose=config.verbose,
**kwargs
) )
# Set response data # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
css_selector=config.css_selector, css_selector=config.css_selector,
only_text=config.only_text, only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold, image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter content_filter=config.content_filter,
**kwargs
) )
if result is None: if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
except Exception as e: except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results # Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", "")) fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", []) media = result.get("media", [])
links = result.get("links", []) links = result.get("links", [])
metadata = result.get("metadata", {}) metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion # Log processing completion
self.logger.info( self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms", message="Processed {url:.50}... | Time: {timing}ms",

View File

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ') cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
markdown_content = self._generate_markdown_content( # markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html, # cleaned_html=cleaned_html,
html=html, # html=html,
url=url, # url=url,
success=success, # success=success,
**kwargs # **kwargs
) # )
return { return {
**markdown_content, # **markdown_content,
'cleaned_html': cleaned_html, 'cleaned_html': cleaned_html,
'success': success, 'success': success,
'media': media, 'media': media,

View File

@@ -1,23 +1,21 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import * from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import * from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/' url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel): class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.") model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.") input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.") output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
result = crawler.run( from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url, url=url,
word_count_threshold=1, word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy( extraction_strategy= LLMExtractionStrategy(
@@ -25,17 +23,18 @@ result = crawler.run(
provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
schema=OpenAIModelFee.model_json_schema(), schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema", extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their "\ instruction="From the crawled content, extract all mentioned model names along with their " \
"fees for input and output tokens. Make sure not to miss anything in the entire content. "\ "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
'One extracted model JSON format should look like this: '\ 'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }' '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
), ),
bypass_cache=True,
)
model_fees = json.loads(result.extracted_content) )
print("Success:", result.success)
model_fees = json.loads(result.extracted_content)
print(len(model_fees))
print(len(model_fees)) with open(".data/data.json", "w", encoding="utf-8") as f:
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content) f.write(result.extracted_content)
asyncio.run(main())

View File

@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
word_count_threshold=1, word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider=provider, provider=provider,
api_token=api_token, api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
# Advanced examples # Advanced examples
# await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2() # await crawl_dynamic_content_pages_method_2()
# Browser comparisons # Browser comparisons
await crawl_custom_browser_type() # await crawl_custom_browser_type()
# Performance testing # Performance testing
# await speed_comparison() # await speed_comparison()
# Screenshot example # Screenshot example
await capture_and_save_screenshot( # await capture_and_save_screenshot(
"https://www.example.com", # "https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg") # os.path.join(__location__, "tmp/example_screenshot.jpg")
) # )
if __name__ == "__main__": if __name__ == "__main__":
asyncio.run(main()) asyncio.run(main())

View File

@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = [] all_commits = []
js_next_page = """ js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]'); const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click(); if (button) button.click();
})();
""" """
for page in range(3): # Crawl 3 pages for page in range(3): # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
async def main(): async def main():
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await simple_crawl() # await simple_crawl()
await simple_example_with_running_js_code() # await simple_example_with_running_js_code()
await simple_example_with_css_selector() # await simple_example_with_css_selector()
# await use_proxy() # # await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor() # await extract_structured_data_using_css_extractor()
# LLM extraction examples # LLM extraction examples
# await extract_structured_data_using_llm() # await extract_structured_data_using_llm()

View File

@@ -99,7 +99,7 @@ async def main():
remove_overlay_elements=True, remove_overlay_elements=True,
# Cache control # Cache control
cache_mode=CacheMode.ENABLE # Use cache if available cache_mode=CacheMode.ENABLED # Use cache if available
) )
if result.success: if result.success:

View File

@@ -57,6 +57,9 @@ setup(
author_email="unclecode@kidocode.com", author_email="unclecode@kidocode.com",
license="MIT", license="MIT",
packages=find_packages(), packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles + ["playwright", "aiofiles"], # Added aiofiles
extras_require={ extras_require={