From b09a86c0c1bc1036ff4954da991dfbccf65534cd Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 24 Nov 2024 19:40:10 +0800 Subject: [PATCH 1/4] chore: remove deprecated Docker Compose configurations for crawl4ai service --- docker-compose.hub.yml | 27 --------------------------- docker-compose.local.yml | 33 --------------------------------- 2 files changed, 60 deletions(-) delete mode 100644 docker-compose.hub.yml delete mode 100644 docker-compose.local.yml diff --git a/docker-compose.hub.yml b/docker-compose.hub.yml deleted file mode 100644 index 9bcfa982..00000000 --- a/docker-compose.hub.yml +++ /dev/null @@ -1,27 +0,0 @@ -services: - crawl4ai: - image: unclecode/crawl4ai:basic # Pull image from Docker Hub - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s diff --git a/docker-compose.local.yml b/docker-compose.local.yml deleted file mode 100644 index 7dc41b47..00000000 --- a/docker-compose.local.yml +++ /dev/null @@ -1,33 +0,0 @@ -services: - crawl4ai: - build: - context: . 
- dockerfile: Dockerfile - args: - PYTHON_VERSION: 3.10 - INSTALL_TYPE: all - ENABLE_GPU: false - ports: - - "11235:11235" # FastAPI server - - "8000:8000" # Alternative port - - "9222:9222" # Browser debugging - - "8080:8080" # Additional port - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API token - - OPENAI_API_KEY=${OPENAI_API_KEY:-} # Optional OpenAI API key - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} # Optional Claude API key - volumes: - - /dev/shm:/dev/shm # Shared memory for browser operations - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s \ No newline at end of file From c6a022132b9fff4db14586a55c95f346ac3da5f7 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 27 Nov 2024 14:55:56 +0800 Subject: [PATCH 2/4] docs: update CONTRIBUTORS.md to acknowledge aadityakanjolia4 for fixing 'CustomHTML2Text' bug --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 0b5dcede..81e916cb 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -10,6 +10,7 @@ We would like to thank the following people for their contributions to Crawl4AI: ## Community Contributors +- [aadityakanjolia4](https://github.com/aadityakanjolia4) - Fixing 'CustomHTML2Text' is not defined bug in the code. 
- [FractalMind](https://github.com/FractalMind) - Created the first official Docker Hub image and fixed Dockerfile errors - [ketonkss4](https://github.com/ketonkss4) - Identified Selenium's new capabilities, helping reduce dependencies - [jonymusky](https://github.com/jonymusky) - Javascript execution documentation, and wait_for From 73661f7d1fd37111e34e4dc9ec10f87d5a5f3afe Mon Sep 17 00:00:00 2001 From: zhounan Date: Wed, 27 Nov 2024 15:04:20 +0800 Subject: [PATCH 3/4] docs: enhance development installation instructions (#286) Thanks for your contribution. I'm merging your changes and I'll add your name to our contributor list. Thank you so much. --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c5e256e..5ba33dea 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,15 @@ For contributors who plan to modify the source code: ```bash git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -pip install -e . +pip install -e . # Basic installation in editable mode +``` +Install optional features: +```bash +pip install -e ".[torch]" # With PyTorch features +pip install -e ".[transformer]" # With Transformer features +pip install -e ".[cosine]" # With cosine similarity features +pip install -e ".[sync]" # With synchronous crawling (Selenium) +pip install -e ".[all]" # Install all optional features ``` ## One-Click Deployment 🚀 From f998e9e94906302a4ee32cd5e581f4fa7bd22021 Mon Sep 17 00:00:00 2001 From: Hamza Farhan Date: Wed, 27 Nov 2024 16:20:54 +0500 Subject: [PATCH 4/4] Fix: handled the cases where markdown_with_citations, references_markdown, and filtered_html might not be defined. (#293) Thanks, dear Farhan, for the changes you made in the code. I accepted and merged them into the main branch. Also, I will add your name to our contributor list. Thank you so much. 
--- crawl4ai/markdown_generation_strategy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crawl4ai/markdown_generation_strategy.py b/crawl4ai/markdown_generation_strategy.py index 7922c413..249bc1ce 100644 --- a/crawl4ai/markdown_generation_strategy.py +++ b/crawl4ai/markdown_generation_strategy.py @@ -84,6 +84,8 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): raw_markdown = raw_markdown.replace(' ```', '```') # Convert links to citations + markdown_with_citations: str = "" + references_markdown: str = "" if citations: markdown_with_citations, references_markdown = self.convert_links_to_citations( raw_markdown, base_url @@ -91,9 +93,9 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): # Generate fit markdown if content filter is provided fit_markdown: Optional[str] = None + filtered_html: Optional[str] = None if content_filter: - filtered_html = content_filter.filter_content(cleaned_html) - filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html) + filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in content_filter.filter_content(cleaned_html)) fit_markdown = h.handle(filtered_html) return MarkdownGenerationResult( @@ -101,7 +103,7 @@ class DefaultMarkdownGenerationStrategy(MarkdownGenerationStrategy): markdown_with_citations=markdown_with_citations, references_markdown=references_markdown, fit_markdown=fit_markdown, - fit_html=filtered_html + fit_html=filtered_html, ) def fast_urljoin(base: str, url: str) -> str: