Delete .do/deploy.template.yaml

Update README.md (#390 )
Update README.md (#389 )
2024-12-31 17:33:39 +08:00 · 2024-12-30 21:24:43 +08:00 · 2024-12-30 21:24:06 +08:00 · 2024-12-27 17:42:59 +08:00 · 2024-12-24 19:56:07 +08:00 · 2024-12-15 19:49:38 +08:00
12 changed files with 87 additions and 87 deletions
--- a/.do/deploy.template.yaml
+++ b/.do/deploy.template.yaml
@@ -1,22 +0,0 @@
-spec:
-  name: crawl4ai
-  services:
-    - name: crawl4ai
-      git:
-        branch: 0.3.74
-        repo_clone_url: https://github.com/unclecode/crawl4ai.git
-      dockerfile_path: Dockerfile
-      http_port: 11235
-      instance_count: 1
-      instance_size_slug: professional-xs
-      health_check:
-        http_path: /health
-      envs:
-        - key: INSTALL_TYPE
-          value: "basic"
-        - key: PYTHON_VERSION  
-          value: "3.10"
-        - key: ENABLE_GPU
-          value: "false"
-      routes:
-        - path: /
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,2 @@
-include requirements.txt
+include requirements.txt
+recursive-include crawl4ai/js_snippet *.js
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# 🚀🤖 Crawl4AI: Crawl Smarter, Faster, Freely. For AI.
+# 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.

 <a href="https://trendshift.io/repositories/11716" target="_blank"><img src="https://trendshift.io/api/badge/repositories/11716" alt="unclecode%2Fcrawl4ai | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>

--- a/crawl4ai/version.py
+++ b/crawl4ai/version.py
@@ -1,2 +1,2 @@
 # crawl4ai/_version.py
-__version__ = "0.4.2"
+__version__ = "0.4.22"
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -7,6 +7,7 @@ from .config import (
 from .user_agent_generator import UserAgentGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy
+from .markdown_generation_strategy import MarkdownGenerationStrategy

 class BrowserConfig:
    """
@@ -269,6 +270,7 @@ class CrawlerRunConfig:
        word_count_threshold: int =  MIN_WORD_THRESHOLD ,
        extraction_strategy : ExtractionStrategy=None,  # Will default to NoExtractionStrategy if None
        chunking_strategy : ChunkingStrategy= None,    # Will default to RegexChunking if None
+        markdown_generator : MarkdownGenerationStrategy = None,
        content_filter=None,
        cache_mode=None,
        session_id: str = None,
@@ -309,6 +311,7 @@ class CrawlerRunConfig:
        self.word_count_threshold = word_count_threshold
        self.extraction_strategy = extraction_strategy
        self.chunking_strategy = chunking_strategy
+        self.markdown_generator = markdown_generator
        self.content_filter = content_filter
        self.cache_mode = cache_mode
        self.session_id = session_id
@@ -364,6 +367,7 @@ class CrawlerRunConfig:
            word_count_threshold=kwargs.get("word_count_threshold", 200),
            extraction_strategy=kwargs.get("extraction_strategy"),
            chunking_strategy=kwargs.get("chunking_strategy"),
+            markdown_generator=kwargs.get("markdown_generator"),
            content_filter=kwargs.get("content_filter"),
            cache_mode=kwargs.get("cache_mode"),
            session_id=kwargs.get("session_id"),
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -7,7 +7,8 @@ from pathlib import Path
 from typing import Optional, List, Union
 import json
 import asyncio
-from contextlib import nullcontext, asynccontextmanager
+# from contextlib import nullcontext, asynccontextmanager
+from contextlib import asynccontextmanager
 from .models import CrawlResult, MarkdownGenerationResult
 from .async_database import async_db_manager
 from .chunking_strategy import *
@@ -15,6 +16,7 @@ from .content_filter_strategy import *
 from .extraction_strategy import *
 from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
 from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
+from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
 from .content_scraping_strategy import WebScrapingStrategy
 from .async_logger import AsyncLogger
 from .async_configs import BrowserConfig, CrawlerRunConfig
@@ -132,17 +134,12 @@ class AsyncWebCrawler:

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        await self.crawler_strategy.__aexit__(exc_type, exc_val, exc_tb)
-
-    @asynccontextmanager
-    async def nullcontext(self):
-        yield
    
    async def awarmup(self):
        """Initialize the crawler with warm-up sequence."""
        self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT")
        self.ready = True

-
    @asynccontextmanager
    async def nullcontext(self):
        """异步空上下文管理器"""
@@ -323,7 +320,8 @@ class AsyncWebCrawler:
                        config=config,  # Pass the config object instead of individual parameters
                        screenshot=screenshot_data,
                        pdf_data=pdf_data,
-                        verbose=config.verbose
+                        verbose=config.verbose,
+                        **kwargs
                    )

                    # Set response data
@@ -424,7 +422,8 @@ class AsyncWebCrawler:
                    css_selector=config.css_selector,
                    only_text=config.only_text,
                    image_description_min_word_threshold=config.image_description_min_word_threshold,
-                    content_filter=config.content_filter
+                    content_filter=config.content_filter,
+                    **kwargs
                )

                if result is None:
@@ -435,16 +434,29 @@ class AsyncWebCrawler:
            except Exception as e:
                raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")

+       
+
            # Extract results
-            markdown_v2 = result.get("markdown_v2", None)
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
-            markdown = sanitize_input_encode(result.get("markdown", ""))
            fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
            fit_html = sanitize_input_encode(result.get("fit_html", ""))
            media = result.get("media", [])
            links = result.get("links", [])
            metadata = result.get("metadata", {})

+            # Markdown Generation
+            markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator()
+            if not config.content_filter and not markdown_generator.content_filter:
+                markdown_generator.content_filter = PruningContentFilter()
+            
+            markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
+                cleaned_html=cleaned_html,
+                base_url=url,
+                # html2text_options=kwargs.get('html2text', {})
+            )
+            markdown_v2 = markdown_result
+            markdown = sanitize_input_encode(markdown_result.raw_markdown)
+
            # Log processing completion
            self.logger.info(
                message="Processed {url:.50}... | Time: {timing}ms",
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -602,16 +602,16 @@ class WebScrapingStrategy(ContentScrapingStrategy):

        cleaned_html = str_body.replace('\n\n', '\n').replace('  ', ' ')

-        markdown_content = self._generate_markdown_content(
-            cleaned_html=cleaned_html,
-            html=html,
-            url=url,
-            success=success,
-            **kwargs
-        )
+        # markdown_content = self._generate_markdown_content(
+        #     cleaned_html=cleaned_html,
+        #     html=html,
+        #     url=url,
+        #     success=success,
+        #     **kwargs
+        # )
        
        return {
-            **markdown_content,
+            # **markdown_content,
            'cleaned_html': cleaned_html,
            'success': success,
            'media': media,
--- a/docs/examples/llm_extraction_openai_pricing.py
+++ b/docs/examples/llm_extraction_openai_pricing.py
@@ -1,41 +1,40 @@
-import os
-import time
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
 from crawl4ai.extraction_strategy import *
 from crawl4ai.crawler_strategy import *
+import asyncio
+from pydantic import BaseModel, Field

 url = r'https://openai.com/api/pricing/'

-crawler = WebCrawler()
-crawler.warmup()
-
-from pydantic import BaseModel, Field
-
 class OpenAIModelFee(BaseModel):
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(..., description="Fee for output token for the OpenAI model.")

-result = crawler.run(
-    url=url,
-    word_count_threshold=1,
-    extraction_strategy= LLMExtractionStrategy(
-        # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'), 
-        provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'), 
-        schema=OpenAIModelFee.model_json_schema(),
-        extraction_type="schema",
-        instruction="From the crawled content, extract all mentioned model names along with their "\
-            "fees for input and output tokens. Make sure not to miss anything in the entire content. "\
-            'One extracted model JSON format should look like this: '\
-            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
-    ),
-    bypass_cache=True,
-)
+from crawl4ai import AsyncWebCrawler

-model_fees = json.loads(result.extracted_content)
+async def main():
+    # Use AsyncWebCrawler
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(
+            url=url,
+            word_count_threshold=1,
+            extraction_strategy= LLMExtractionStrategy(
+                # provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
+                provider= "groq/llama-3.1-70b-versatile", api_token = os.getenv('GROQ_API_KEY'),
+                schema=OpenAIModelFee.model_json_schema(),
+                extraction_type="schema",
+                instruction="From the crawled content, extract all mentioned model names along with their " \
+                            "fees for input and output tokens. Make sure not to miss anything in the entire content. " \
+                            'One extracted model JSON format should look like this: ' \
+                            '{ "model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens" }'
+            ),

-print(len(model_fees))
+        )
+        print("Success:", result.success)
+        model_fees = json.loads(result.extracted_content)
+        print(len(model_fees))

-with open(".data/data.json", "w", encoding="utf-8") as f:
-    f.write(result.extracted_content)
+        with open(".data/data.json", "w", encoding="utf-8") as f:
+            f.write(result.extracted_content)
+
+asyncio.run(main())
--- a/docs/examples/quickstart_async.config.py
+++ b/docs/examples/quickstart_async.config.py
@@ -142,6 +142,7 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
+        page_timeout = 80000,
        extraction_strategy=LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
@@ -497,21 +498,21 @@ async def main():
    
    # Advanced examples
    # await extract_structured_data_using_css_extractor()
-    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    
    # Browser comparisons
-    await crawl_custom_browser_type()
+    # await crawl_custom_browser_type()
    
    # Performance testing
    # await speed_comparison()

    # Screenshot example
-    await capture_and_save_screenshot(
-        "https://www.example.com",
-        os.path.join(__location__, "tmp/example_screenshot.jpg")
-    )
+    # await capture_and_save_screenshot(
+    #     "https://www.example.com",
+    #     os.path.join(__location__, "tmp/example_screenshot.jpg")
+    # )

 if __name__ == "__main__":
    asyncio.run(main())
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -239,8 +239,10 @@ async def crawl_dynamic_content_pages_method_1():
        all_commits = []

        js_next_page = """
-        const button = document.querySelector('a[data-testid="pagination-next-button"]');
-        if (button) button.click();
+        (() => {
+            const button = document.querySelector('a[data-testid="pagination-next-button"]');
+            if (button) button.click();
+        })();
        """

        for page in range(3):  # Crawl 3 pages
@@ -604,14 +606,14 @@ async def fit_markdown_remove_overlay():


 async def main():
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    
-    await simple_crawl()
-    await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
-    # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await simple_crawl()
+    # await simple_example_with_running_js_code()
+    # await simple_example_with_css_selector()
+    # # await use_proxy()
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await extract_structured_data_using_css_extractor()

    # LLM extraction examples
    # await extract_structured_data_using_llm()
--- a/docs/md_v2/basic/simple-crawling.md
+++ b/docs/md_v2/basic/simple-crawling.md
@@ -99,7 +99,7 @@ async def main():
            remove_overlay_elements=True,
            
            # Cache control
-            cache_mode=CacheMode.ENABLE  # Use cache if available
+            cache_mode=CacheMode.ENABLED  # Use cache if available
        )
        
        if result.success:
--- a/setup.py
+++ b/setup.py
@@ -57,6 +57,9 @@ setup(
    author_email="unclecode@kidocode.com",
    license="MIT",
    packages=find_packages(),
+    package_data={
+        'crawl4ai': ['js_snippet/*.js']  # This matches the exact path structure
+    },
    install_requires=default_requirements
    + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
Author	SHA1	Message	Date
UncleCode	406702a77f	Delete .do/deploy.template.yaml	2024-12-31 17:33:39 +08:00
UncleCode	7391d6be73	Update README.md (#390 )	2024-12-30 21:24:43 +08:00
UncleCode	e4e23065f1	Update README.md (#389 )	2024-12-30 21:24:06 +08:00
Robin Singh	78768fd714	Update simple-crawling.md (#379 ) In the comprehensive example, AttributeError: type object 'CacheMode' has no attribute 'ENABLE'. Did you mean: 'ENABLED'?	2024-12-27 17:42:59 +08:00
Haopeng138	bacbeb3ed4	Fix #340 example llm_extraction (#358 ) @Haopeng138 Thank you so much. They are still part of the library. I forgot to update them since I moved the asynchronous versions years ago. I really appreciate it. I have to say that I feel weak in the documentation. That's why I spent a lot of time on it last week. Now, when you mention some of the things in the example folder, I realize I forgot about the example folder. I'll try to update it more. If you find anything else, please help and support. Thank you. I will add your name to contributor name as well.	2024-12-24 19:56:07 +08:00
UncleCode	ed7bc1909c	Bump version to 0.4.22	2024-12-15 19:49:38 +08:00
UncleCode	e9e5b5642d	Fix js_snippet issue 0.4.21 bump to 0.4.22	2024-12-15 19:49:30 +08:00
UncleCode	7524aa7b5e	Feature: Add Markdown generation to CrawlerRunConfig - Added markdown generator parameter to CrawlerRunConfig in `async_configs.py`. - Implemented logic for Markdown generation in content scraping in `async_webcrawler.py`. - Updated version number to 0.4.21 in `__version__.py`.	2024-12-13 21:51:38 +08:00