Compare commits

...

8 Commits

Author SHA1 Message Date
UncleCode
406702a77f Delete .do/deploy.template.yaml 2024-12-31 17:33:39 +08:00
UncleCode
7391d6be73 Update README.md (#390) 2024-12-30 21:24:43 +08:00
UncleCode
e4e23065f1 Update README.md (#389) 2024-12-30 21:24:06 +08:00
Robin Singh
78768fd714 Update simple-crawling.md (#379)
In the comprehensive example,

AttributeError: type object 'CacheMode' has no attribute 'ENABLE'. Did you mean: 'ENABLED'?
2024-12-27 17:42:59 +08:00
Haopeng138
bacbeb3ed4 Fix #340 example llm_extraction (#358)
@Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to contributor name as well.
2024-12-24 19:56:07 +08:00
UncleCode
ed7bc1909c Bump version to 0.4.22 2024-12-15 19:49:38 +08:00
UncleCode
e9e5b5642d Fix js_snipprt issue 0.4.21
bump to 0.4.22
2024-12-15 19:49:30 +08:00
UncleCode
7524aa7b5e Feature: Add Markdown generation to CrawlerRunConfig
- Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`.
  - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`.
  - Updated version number to 0.4.21 in `__version__.py`.
2024-12-13 21:51:38 +08:00
12 changed files with 87 additions and 87 deletions

View File

@@ -1,22 +0,0 @@
spec:
name: crawl4ai
services:
- name: crawl4ai
git:
branch: 0.3.74
repo_clone_url: https://github.com/unclecode/crawl4ai.git
dockerfile_path: Dockerfile
http_port: 11235
instance_count: 1
instance_size_slug: professional-xs
health_check:
http_path: /health
envs:
- key: INSTALL_TYPE
value: "basic"
- key: PYTHON_VERSION
value: "3.10"
- key: ENABLE_GPU
value: "false"
routes:
- path: /

View File

@@ -1 +1,2 @@
include requirements.txt
recursive-include crawl4ai/js_snippet *.js

View File

@@ -1,4 +1,4 @@
# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.
<a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py
__version__ = "0.4.2"
__version__ = "0.4.22"

View File

@@ -7,6 +7,7 @@ from .config import (
from .user_agent_generator import UserAgentGenerator
from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy
from .markdown_generation_strategy import MarkdownGenerationStrategy
class BrowserConfig:
"""
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
word_count_threshold: int = MIN_WORD_THRESHOLD ,
extraction_strategy : ExtractionStrategy=None, # Will default to NoExtractionStrategy if None
chunking_strategy : ChunkingStrategy= None, # Will default to RegexChunking if None
markdown_generator : MarkdownGenerationStrategy = None,
content_filter=None,
cache_mode=None,
session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
self.word_count_threshold = word_count_threshold
self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator
self.content_filter = content_filter
self.cache_mode = cache_mode
self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
word_count_threshold=kwargs.get("word_count_threshold", 200),
extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy"),
markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
cache_mode=kwargs.get("cache_mode"),
session_id=kwargs.get("session_id"),

View File

@@ -7,7 +7,8 @@ from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
from contextlib import nullcontext, asynccontextmanager
# from contextlib import nullcontext, asynccontextmanager
from contextlib import asynccontextmanager
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -133,16 +135,11 @@ class AsyncWebCrawler:
async def __aexit__(self, exc_type, exc_val, exc_tb):
await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
@asynccontextmanager
async def nullcontext(self):
yield
async def awarmup(self):
"""Initialize the crawler with warm-up sequence."""
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
self.ready = True
@asynccontextmanager
async def nullcontext(self):
"""异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
config=config, # Pass the config object instead of individual parameters
screenshot=screenshot_data,
pdf_data=pdf_data,
verbose=config.verbose
verbose=config.verbose,
**kwargs
)
# Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
css_selector=config.css_selector,
only_text=config.only_text,
image_description_min_word_threshold=config.image_description_min_word_threshold,
content_filter=config.content_filter
content_filter=config.content_filter,
**kwargs
)
if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
# Extract results
markdown_v2 = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
fit_html = sanitize_input_encode(result.get("fit_html", ""))
media = result.get("media", [])
links = result.get("links", [])
metadata = result.get("metadata", {})
# Markdown Generation
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
if not config.content_filter and not markdown_generator.content_filter:
markdown_generator.content_filter = PruningContentFilter()
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,
base_url=url,
# html2text_options=kwargs.get('html2text', {})
)
markdown_v2 = markdown_result
markdown = sanitize_input_encode(markdown_result.raw_markdown)
# Log processing completion
self.logger.info(
message="Processed {url:.50}... | Time: {timing}ms",

View File

@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):
cleaned_html = str_body.replace('\n\n', '\n').replace(' ', ' ')
markdown_content = self._generate_markdown_content(
cleaned_html=cleaned_html,
html=html,
url=url,
success=success,
**kwargs
)
# markdown_content = self._generate_markdown_content(
# cleaned_html=cleaned_html,
# html=html,
# url=url,
# success=success,
# **kwargs
# )
return {
**markdown_content,
# **markdown_content,
'cleaned_html': cleaned_html,
'success': success,
'media': media,

View File

@@ -1,23 +1,21 @@
import os
import time
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
import asyncio
from pydantic import BaseModel, Field
url = r'https://openai.com/api/pricing/'
crawler = WebCrawler()
crawler.warmup()
from pydantic import BaseModel, Field
class OpenAIModelFee(BaseModel):
model_name: str = Field(..., description="Name of the OpenAI model.")
input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")
result = crawler.run(
from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=url,
word_count_threshold=1,
extraction_strategy= LLMExtractionStrategy(
@@ -30,12 +28,13 @@ result = crawler.run(
'One extracted model JSON format should look like this: ' \
'{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
),
bypass_cache=True,
)
print("Success:", result.success)
model_fees = json.loads(result.extracted_content)
print(len(model_fees))
with open(".data/data.json", "w", encoding="utf-8") as f:
f.write(result.extracted_content)
asyncio.run(main())

View File

@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=1,
page_timeout = 80000,
extraction_strategy=LLMExtractionStrategy(
provider=provider,
api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
# Advanced examples
# await extract_structured_data_using_css_extractor()
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await crawl_dynamic_content_pages_method_1()
# await crawl_dynamic_content_pages_method_2()
# Browser comparisons
await crawl_custom_browser_type()
# await crawl_custom_browser_type()
# Performance testing
# await speed_comparison()
# Screenshot example
await capture_and_save_screenshot(
"https://www.example.com",
os.path.join(__location__, "tmp/example_screenshot.jpg")
)
# await capture_and_save_screenshot(
# "https://www.example.com",
# os.path.join(__location__, "tmp/example_screenshot.jpg")
# )
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
all_commits = []
js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
})();
"""
for page in range(3): # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():
async def main():
await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
# await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
await simple_crawl()
await simple_example_with_running_js_code()
await simple_example_with_css_selector()
# await use_proxy()
await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
await extract_structured_data_using_css_extractor()
# await simple_crawl()
# await simple_example_with_running_js_code()
# await simple_example_with_css_selector()
# # await use_proxy()
# await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
# await extract_structured_data_using_css_extractor()
# LLM extraction examples
# await extract_structured_data_using_llm()

View File

@@ -99,7 +99,7 @@ async def main():
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLE # Use cache if available
cache_mode=CacheMode.ENABLED # Use cache if available
)
if result.success:

View File

@@ -57,6 +57,9 @@ setup(
author_email="unclecode@kidocode.com",
license="MIT",
packages=find_packages(),
package_data={
'crawl4ai': ['js_snippet/*.js'] # This matches the exact path structure
},
install_requires=default_requirements
+ ["playwright", "aiofiles"], # Added aiofiles
extras_require={