From c0e87abaee97e9e206eb787f8939fdf8790f4a2b Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 28 Nov 2024 21:43:08 +0800 Subject: [PATCH 1/7] fix: update package versions in requirements.txt for compatibility --- requirements.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index ed259ac9..c0f6f183 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,16 +1,16 @@ aiosqlite~=0.20 html2text~=2024.2 lxml~=5.3 -litellm~=1.48 +litellm>=1.53.1 numpy>=1.26.0,<3 pillow~=10.4 -playwright>=1.47,<1.48 +playwright>=1.49.0 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 -tf-playwright-stealth~=1.0 +tf-playwright-stealth>=1.1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles~=24.0 +aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 \ No newline at end of file From 449dd7cc0b9d81e0f602b3868b478c8515a45bf1 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 29 Nov 2024 14:45:04 +0800 Subject: [PATCH 2/7] Migrating from the classic setup.py to a using PyProject approach. --- MANIFEST.in | 1 - build_hooks.py | 48 +++++++++++ docs/examples/quickstart_async.py | 128 +++++++++++++++++----------- plugin.py | 9 ++ post_install.py | 19 +++++ pyproject.toml | 75 ++++++++++++++++ requirements.txt | 16 ---- setup.cfg | 2 - setup.py | 136 ------------------------------ 9 files changed, 229 insertions(+), 205 deletions(-) delete mode 100644 MANIFEST.in create mode 100644 build_hooks.py create mode 100644 plugin.py create mode 100644 post_install.py create mode 100644 pyproject.toml delete mode 100644 requirements.txt delete mode 100644 setup.cfg delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 540b7204..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py new file mode 100644 index 00000000..e59b5910 --- /dev/null +++ b/build_hooks.py @@ -0,0 +1,48 @@ +import os +import shutil +from pathlib import Path +import subprocess +import sys +from hatchling.builders.hooks.plugin.interface import BuildHookInterface +PLUGIN = "CustomBuildHook" + +class CustomBuildHook(BuildHookInterface): + def initialize(self, version, build_data): + # Create the .crawl4ai folder structure + base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") + crawl4ai_folder = Path(base_dir) if base_dir else Path.home() + crawl4ai_folder = crawl4ai_folder / ".crawl4ai" + cache_folder = crawl4ai_folder / "cache" + content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", + ] + + # Clean up old cache if exists + if cache_folder.exists(): + shutil.rmtree(cache_folder) + + # Create new folder structure + crawl4ai_folder.mkdir(exist_ok=True) + cache_folder.mkdir(exist_ok=True) + for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + + # Install Playwright browsers + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + except Exception as e: + print(f"Warning: Playwright installation failed: {e}") + print("Please run 'python -m playwright install' manually after installation") + + # Initialize database + try: + from crawl4ai.async_database import async_db_manager + import asyncio + asyncio.run(async_db_manager.initialize()) + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") \ No newline at end of file diff --git a/docs/examples/quickstart_async.py 
b/docs/examples/quickstart_async.py index 9f1eff53..01f7677c 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com") async def simple_crawl(): print("\n--- Basic Usage ---") async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) # Print first 500 characters async def simple_example_with_running_js_code(): @@ -76,16 +76,17 @@ async def use_proxy(): async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - bypass_cache=True + cache_mode= CacheMode.BYPASS ) - print(result.markdown[:500]) # Print first 500 characters + if result.success: + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode= CacheMode.BYPASS ) if result.success and result.screenshot: @@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", + "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); } - ], - } + })(); + """ - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://www.coinbase.com/explore", - extraction_strategy=extraction_strategy, - cache_mode=CacheMode.BYPASS, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" - - news_teasers = json.loads(result.extracted_content) - print(f"Successfully extracted {len(news_teasers)} news teasers") - 
print(json.dumps(news_teasers[0], indent=2)) + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) # Advanced Session-Based Crawling with Dynamic Content 🔄 async def crawl_dynamic_content_pages_method_1(): @@ -363,21 +391,21 @@ async def crawl_custom_browser_type(): # Use Firefox start = time.time() async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use WebKit start = time.time() async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use Chromium (default) start = time.time() async with AsyncWebCrawler(verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) @@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay(): async def main(): - await simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..1e1b11bf --- /dev/null +++ b/plugin.py @@ -0,0 +1,9 @@ +from colorama import Fore, Style +import subprocess +import sys + +def post_install(): + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run 
this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py new file mode 100644 index 00000000..e536e547 --- /dev/null +++ b/post_install.py @@ -0,0 +1,19 @@ +from colorama import Fore, Style +import subprocess +import sys +import distutils.log as log +from pathlib import Path + +def main(): + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + except: + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..cfef8101 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] +build-backend = "hatchling.build" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.7" +authors = [ + { name = "Unclecode", email = "unclecode@kidocode.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", +] +dependencies = [ + "aiosqlite~=0.20", + "html2text~=2024.2", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", +] + +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + "transformers", + "tokenizers", + "selenium", +] + +[project.urls] +Homepage = "https://github.com/unclecode/crawl4ai" +Documentation = "https://crawl4ai.com/mkdocs/" + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-post-install = "crawl4ai.post_install:main" + +[tool.hatch.version] +path = "crawl4ai/__version__.py" + +[tool.hatch.build.hooks.custom] +dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] +path = "build_hooks.py" + +[project.entry-points.hatch] +crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c0f6f183..00000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -aiosqlite~=0.20 -html2text~=2024.2 -lxml~=5.3 -litellm>=1.53.1 -numpy>=1.26.0,<3 -pillow~=10.4 -playwright>=1.49.0 -python-dotenv~=1.0 -requests~=2.26 -beautifulsoup4~=4.12 -tf-playwright-stealth>=1.1.0 -xxhash~=3.4 -rank-bm25~=0.2 -aiofiles>=24.1.0 -colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 56490d6a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ 
-[options] -include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index d44169bf..00000000 --- a/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -from setuptools import setup, find_packages -from setuptools.command.install import install -import os -from pathlib import Path -import shutil -import subprocess -import sys -import asyncio - -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder -base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") -crawl4ai_folder = Path(base_dir) if base_dir else Path.home() -crawl4ai_folder = crawl4ai_folder / ".crawl4ai" -cache_folder = crawl4ai_folder / "cache" -content_folders = [ - "html_content", - "cleaned_html", - "markdown_content", - "extracted_content", - "screenshots", -] - -# Clean up old cache if exists -if cache_folder.exists(): - shutil.rmtree(cache_folder) - -# Create new folder structure -crawl4ai_folder.mkdir(exist_ok=True) -cache_folder.mkdir(exist_ok=True) -for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() - -with open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break - -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] - - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - -setup( - name="Crawl4AI", - version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/unclecode/crawl4ai", - author="Unclecode", - author_email="unclecode@kidocode.com", - license="MIT", - packages=find_packages(), - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command - ], - }, - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, -) From 12e73d489846dc83c29347bf84646ad8daef6cfc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 16:01:19 +0800 Subject: [PATCH 3/7] refactor: remove legacy build hooks and setup files, migrate to setup.cfg and pyproject.toml --- MANIFEST.in | 1 + build_hooks.py | 48 ----------------- plugin.py | 9 ---- post_install.py | 19 ------- pyproject.toml | 75 -------------------------- requirements.txt | 16 ++++++ setup.cfg | 2 + setup.py | 136 +++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 155 insertions(+), 151 deletions(-) create mode 100644 MANIFEST.in delete mode 100644 build_hooks.py delete mode 100644 plugin.py delete mode 100644 post_install.py delete mode 100644 pyproject.toml create mode 100644 requirements.txt create mode 100644 setup.cfg create mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..540b7204 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py deleted file mode 100644 index e59b5910..00000000 --- a/build_hooks.py +++ /dev/null @@ -1,48 +0,0 @@ -import os -import shutil -from pathlib import Path -import subprocess -import sys -from hatchling.builders.hooks.plugin.interface import BuildHookInterface -PLUGIN = "CustomBuildHook" - -class CustomBuildHook(BuildHookInterface): - def initialize(self, version, build_data): - # Create the .crawl4ai folder structure - base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") - crawl4ai_folder = Path(base_dir) if base_dir else Path.home() - crawl4ai_folder = crawl4ai_folder / ".crawl4ai" - cache_folder = crawl4ai_folder / "cache" - content_folders = [ - "html_content", - "cleaned_html", - "markdown_content", - "extracted_content", - "screenshots", - ] - - # Clean up old cache if exists - if 
cache_folder.exists(): - shutil.rmtree(cache_folder) - - # Create new folder structure - crawl4ai_folder.mkdir(exist_ok=True) - cache_folder.mkdir(exist_ok=True) - for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - - # Install Playwright browsers - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - except Exception as e: - print(f"Warning: Playwright installation failed: {e}") - print("Please run 'python -m playwright install' manually after installation") - - # Initialize database - try: - from crawl4ai.async_database import async_db_manager - import asyncio - asyncio.run(async_db_manager.initialize()) - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") \ No newline at end of file diff --git a/plugin.py b/plugin.py deleted file mode 100644 index 1e1b11bf..00000000 --- a/plugin.py +++ /dev/null @@ -1,9 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys - -def post_install(): - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py deleted file mode 100644 index e536e547..00000000 --- a/post_install.py +++ /dev/null @@ -1,19 +0,0 @@ -from colorama import Fore, Style -import subprocess -import sys -import distutils.log as log -from pathlib import Path - -def main(): - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL) - except: - print(f"\n{Fore.YELLOW}{'='*40}") - print(f"{Fore.RED}IMPORTANT: Run this command now:") - print(f"{Fore.GREEN}python -m playwright install") - print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index cfef8101..00000000 --- a/pyproject.toml +++ /dev/null @@ -1,75 +0,0 @@ -[build-system] -requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] -build-backend = "hatchling.build" - -[project] -name = "Crawl4AI" -dynamic = ["version"] -description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" -readme = "README.md" -license = "Apache-2.0" -requires-python = ">=3.7" -authors = [ - { name = "Unclecode", email = "unclecode@kidocode.com" }, -] -classifiers = [ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", -] -dependencies = [ - "aiosqlite~=0.20", - "html2text~=2024.2", - "lxml~=5.3", - "litellm>=1.53.1", - "numpy>=1.26.0,<3", - "pillow~=10.4", - "playwright>=1.49.0", - "python-dotenv~=1.0", - "requests~=2.26", - "beautifulsoup4~=4.12", - "tf-playwright-stealth>=1.1.0", - "xxhash~=3.4", - "rank-bm25~=0.2", - "aiofiles>=24.1.0", - "colorama~=0.4", - "snowballstemmer~=2.2", -] - -[project.optional-dependencies] -torch = ["torch", "nltk", "scikit-learn"] -transformer = ["transformers", "tokenizers"] -cosine = ["torch", "transformers", "nltk"] -sync = ["selenium"] -all = [ - "torch", - "nltk", - "scikit-learn", - "transformers", - "tokenizers", - "selenium", -] - 
-[project.urls] -Homepage = "https://github.com/unclecode/crawl4ai" -Documentation = "https://crawl4ai.com/mkdocs/" - -[project.scripts] -crawl4ai-download-models = "crawl4ai.model_loader:main" -crawl4ai-migrate = "crawl4ai.migrations:main" -crawl4ai-post-install = "crawl4ai.post_install:main" - -[tool.hatch.version] -path = "crawl4ai/__version__.py" - -[tool.hatch.build.hooks.custom] -dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] -path = "build_hooks.py" - -[project.entry-points.hatch] -crawl4ai = "crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..c0f6f183 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +aiosqlite~=0.20 +html2text~=2024.2 +lxml~=5.3 +litellm>=1.53.1 +numpy>=1.26.0,<3 +pillow~=10.4 +playwright>=1.49.0 +python-dotenv~=1.0 +requests~=2.26 +beautifulsoup4~=4.12 +tf-playwright-stealth>=1.1.0 +xxhash~=3.4 +rank-bm25~=0.2 +aiofiles>=24.1.0 +colorama~=0.4 +snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..56490d6a --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[options] +include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 00000000..d44169bf --- /dev/null +++ b/setup.py @@ -0,0 +1,136 @@ +from setuptools import setup, find_packages +from setuptools.command.install import install +import os +from pathlib import Path +import shutil +import subprocess +import sys +import asyncio + +# Create the .crawl4ai folder in the user's home directory if it doesn't exist +# If the folder already exists, remove the cache folder +base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") +crawl4ai_folder = Path(base_dir) if base_dir else Path.home() +crawl4ai_folder = crawl4ai_folder / ".crawl4ai" +cache_folder = crawl4ai_folder / "cache" +content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", +] + +# Clean up old cache if exists +if cache_folder.exists(): + shutil.rmtree(cache_folder) + +# Create new folder structure +crawl4ai_folder.mkdir(exist_ok=True) +cache_folder.mkdir(exist_ok=True) +for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + +# Read requirements and version +__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) +with open(os.path.join(__location__, "requirements.txt")) as f: + requirements = f.read().splitlines() + +with open("crawl4ai/__version__.py") as f: + for line in f: + if line.startswith("__version__"): + version = line.split("=")[1].strip().strip('"') + break + +# Define requirements +default_requirements = requirements +torch_requirements = ["torch", "nltk", "scikit-learn"] +transformer_requirements = ["transformers", "tokenizers"] +cosine_similarity_requirements = ["torch", "transformers", "nltk"] +sync_requirements = ["selenium"] + + +def install_playwright(): + print("Installing Playwright browsers...") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + print("Playwright installation completed successfully.") + except subprocess.CalledProcessError as e: + print(f"Error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + print(f"Unexpected error during Playwright installation: {e}") + print( + "Please run 'python -m playwright install' manually after the installation." 
+ ) + + +def run_migration(): + """Initialize database during installation""" + try: + print("Starting database initialization...") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + print("Database initialization completed successfully.") + except ImportError: + print("Warning: Database module not found. Will initialize on first use.") + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") + + +class PostInstallCommand(install): + def run(self): + install.run(self) + install_playwright() + # run_migration() + + +setup( + name="Crawl4AI", + version=version, + description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type="text/markdown", + url="https://github.com/unclecode/crawl4ai", + author="Unclecode", + author_email="unclecode@kidocode.com", + license="MIT", + packages=find_packages(), + install_requires=default_requirements + + ["playwright", "aiofiles"], # Added aiofiles + extras_require={ + "torch": torch_requirements, + "transformer": transformer_requirements, + "cosine": cosine_similarity_requirements, + "sync": sync_requirements, + "all": default_requirements + + torch_requirements + + transformer_requirements + + cosine_similarity_requirements + + sync_requirements, + }, + entry_points={ + "console_scripts": [ + "crawl4ai-download-models=crawl4ai.model_loader:main", + "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + ], + }, + classifiers=[ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + ], + python_requires=">=3.7", + cmdclass={ + "install": PostInstallCommand, + }, +) From d202f3539bf7447f7594f7f1897c3062c337ae52 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 18:48:44 +0800 Subject: [PATCH 4/7] Enhance installation and migration processes - Added a post-installation setup script for initialization. - Updated README with installation notes for Playwright setup. - Enhanced migration logging for better error visibility. - Added 'pydantic' to requirements. - Bumped version to 0.3.746. --- README.md | 32 ++----------------- crawl4ai/__init__.py | 1 - crawl4ai/__version__.py | 2 +- crawl4ai/install.py | 44 ++++++++++++++++++++++++++ crawl4ai/migrations.py | 40 ++++++++++++++++-------- docs/examples/quickstart_async.py | 18 +++++------ requirements.txt | 4 +-- setup.py | 51 ++----------------------------- 8 files changed, 90 insertions(+), 102 deletions(-) create mode 100644 crawl4ai/install.py diff --git a/README.md b/README.md index e8e6cddf..bbfa5858 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant 1. Install Crawl4AI: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` 2. 
Run a simple web crawl: @@ -125,34 +126,6 @@ if __name__ == "__main__": ✨ Visit our [Documentation Website](https://crawl4ai.com/mkdocs/) -## Features ✨ - -- 🆓 Completely free and open-source -- 🚀 Blazing fast performance, outperforming many paid services -- 🤖 LLM-friendly output formats (JSON, cleaned HTML, markdown) -- 🌐 Multi-browser support (Chromium, Firefox, WebKit) -- 🌍 Supports crawling multiple URLs simultaneously -- 🎨 Extracts and returns all media tags (Images, Audio, and Video) -- 🔗 Extracts all external and internal links -- 📚 Extracts metadata from the page -- 🔄 Custom hooks for authentication, headers, and page modifications -- 🕵️ User-agent customization -- 🖼️ Takes screenshots of pages with enhanced error handling -- 📜 Executes multiple custom JavaScripts before crawling -- 📊 Generates structured output without LLM using JsonCssExtractionStrategy -- 📚 Various chunking strategies: topic-based, regex, sentence, and more -- 🧠 Advanced extraction strategies: cosine clustering, LLM, and more -- 🎯 CSS selector support for precise data extraction -- 📝 Passes instructions/keywords to refine extraction -- 🔒 Proxy support with authentication for enhanced access -- 🔄 Session management for complex multi-page crawling -- 🌐 Asynchronous architecture for improved performance -- 🖼️ Improved image processing with lazy-loading detection -- 🕰️ Enhanced handling of delayed content loading -- 🔑 Custom headers support for LLM interactions -- 🖼️ iframe content extraction for comprehensive analysis -- ⏱️ Flexible timeout and delayed content retrieval options - ## Installation 🛠️ Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package or use Docker. @@ -168,11 +141,12 @@ For basic web crawling and scraping tasks: ```bash pip install crawl4ai +crawl4ai-setup # Setup the browser ``` By default, this will install the asynchronous version of Crawl4AI, using Playwright for web crawling. -👉 **Note**: When you install Crawl4AI, the setup script should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: +👉 **Note**: When you install Crawl4AI, the `crawl4ai-setup` should automatically install and set up Playwright. However, if you encounter any Playwright-related errors, you can manually install it using one of these methods: 1. 
Through the command line: diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 0ccf13d8..cee7c25b 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -4,7 +4,6 @@ from .async_webcrawler import AsyncWebCrawler, CacheMode from .models import CrawlResult from .__version__ import __version__ -# __version__ = "0.3.73" __all__ = [ "AsyncWebCrawler", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index 8b69d491..4a938b75 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.3.745" +__version__ = "0.3.746" diff --git a/crawl4ai/install.py b/crawl4ai/install.py new file mode 100644 index 00000000..71fe30ea --- /dev/null +++ b/crawl4ai/install.py @@ -0,0 +1,44 @@ +import subprocess +import sys +import asyncio +from .async_logger import AsyncLogger, LogLevel + +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +def post_install(): + """Run all post-installation tasks""" + logger.info("Running post-installation setup...", tag="INIT") + install_playwright() + run_migration() + logger.success("Post-installation setup completed!", tag="COMPLETE") + +def install_playwright(): + logger.info("Installing Playwright browsers...", tag="INIT") + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + logger.success("Playwright installation completed successfully.", tag="COMPLETE") + except subprocess.CalledProcessError as e: + logger.error(f"Error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + except Exception as e: + logger.error(f"Unexpected error during Playwright installation: {e}", tag="ERROR") + logger.warning( + "Please run 'python -m playwright install' manually after the installation." + ) + +def run_migration(): + """Initialize database during installation""" + try: + logger.info("Starting database initialization...", tag="INIT") + from crawl4ai.async_database import async_db_manager + + asyncio.run(async_db_manager.initialize()) + logger.success("Database initialization completed successfully.", tag="COMPLETE") + except ImportError: + logger.warning("Database module not found. 
Will initialize on first use.") + except Exception as e: + logger.warning(f"Database initialization failed: {e}") + logger.warning("Database will be initialized on first use") \ No newline at end of file diff --git a/crawl4ai/migrations.py b/crawl4ai/migrations.py index 77616086..3386b0fb 100644 --- a/crawl4ai/migrations.py +++ b/crawl4ai/migrations.py @@ -9,9 +9,13 @@ import aiofiles import shutil import time from datetime import datetime +from .async_logger import AsyncLogger, LogLevel -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) +# Initialize logger +logger = AsyncLogger(log_level=LogLevel.DEBUG, verbose=True) + +# logging.basicConfig(level=logging.INFO) +# logger = logging.getLogger(__name__) class DatabaseMigration: def __init__(self, db_path: str): @@ -55,7 +59,8 @@ class DatabaseMigration: async def migrate_database(self): """Migrate existing database to file-based storage""" - logger.info("Starting database migration...") + # logger.info("Starting database migration...") + logger.info("Starting database migration...", tag="INIT") try: async with aiosqlite.connect(self.db_path) as db: @@ -91,19 +96,25 @@ class DatabaseMigration: migrated_count += 1 if migrated_count % 100 == 0: - logger.info(f"Migrated {migrated_count} records...") + logger.info(f"Migrated {migrated_count} records...", tag="INIT") + await db.commit() - logger.info(f"Migration completed. {migrated_count} records processed.") + logger.success(f"Migration completed. {migrated_count} records processed.", tag="COMPLETE") except Exception as e: - logger.error(f"Migration failed: {e}") - raise + # logger.error(f"Migration failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def backup_database(db_path: str) -> str: """Create backup of existing database""" if not os.path.exists(db_path): - logger.info("No existing database found. Skipping backup.") + logger.info("No existing database found. Skipping backup.", tag="INIT") return None # Create backup with timestamp @@ -116,11 +127,16 @@ async def backup_database(db_path: str) -> str: # Create backup shutil.copy2(db_path, backup_path) - logger.info(f"Database backup created at: {backup_path}") + logger.info(f"Database backup created at: {backup_path}", tag="COMPLETE") return backup_path except Exception as e: - logger.error(f"Backup failed: {e}") - raise + # logger.error(f"Backup failed: {e}") + logger.error( + message="Migration failed: {error}", + tag="ERROR", + params={"error": str(e)} + ) + raise e async def run_migration(db_path: Optional[str] = None): """Run database migration""" @@ -128,7 +144,7 @@ async def run_migration(db_path: Optional[str] = None): db_path = os.path.join(Path.home(), ".crawl4ai", "crawl4ai.db") if not os.path.exists(db_path): - logger.info("No existing database found. Skipping migration.") + logger.info("No existing database found. 
Skipping migration.", tag="INIT") return # Create backup first diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 01f7677c..679a9bc2 100644 --- a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -562,18 +562,18 @@ async def fit_markdown_remove_overlay(): async def main(): - # await simple_crawl() - # await simple_example_with_running_js_code() - # await simple_example_with_css_selector() + await simple_crawl() + await simple_example_with_running_js_code() + await simple_example_with_css_selector() # await use_proxy() - # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - # await extract_structured_data_using_css_extractor() + await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) + await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy # custom_headers = { @@ -582,9 +582,9 @@ async def main(): # } # await extract_structured_data_using_llm(extra_headers=custom_headers) - # await crawl_dynamic_content_pages_method_1() - # await crawl_dynamic_content_pages_method_2() - # await crawl_dynamic_content_pages_method_3() + await crawl_dynamic_content_pages_method_1() + await crawl_dynamic_content_pages_method_2() + await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/requirements.txt b/requirements.txt index c0f6f183..741e12ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ aiosqlite~=0.20 -html2text~=2024.2 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 @@ -13,4 +12,5 @@ xxhash~=3.4 rank-bm25~=0.2 aiofiles>=24.1.0 colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file +snowballstemmer~=2.2 +pydantic>=2.10 \ No newline at end of file diff --git a/setup.py b/setup.py index d44169bf..e6840cd0 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,8 @@ from setuptools import setup, find_packages -from setuptools.command.install import install import os from pathlib import Path import shutil -import subprocess -import sys -import asyncio + # Create the .crawl4ai folder in the user's home directory if it doesn't exist # If the folder already exists, remove the cache folder @@ -49,46 +46,6 @@ transformer_requirements = ["transformers", "tokenizers"] cosine_similarity_requirements = ["torch", "transformers", "nltk"] sync_requirements = ["selenium"] - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." 
- ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - setup( name="Crawl4AI", version=version, @@ -116,7 +73,8 @@ setup( entry_points={ "console_scripts": [ "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command + "crawl4ai-migrate=crawl4ai.migrations:main", + 'crawl4ai-setup=crawl4ai.install:post_install', ], }, classifiers=[ @@ -130,7 +88,4 @@ setup( "Programming Language :: Python :: 3.10", ], python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, ) From 93bf3e8a1f87760e04d6a18b2e27bae0f5d5da0e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:08:09 +0800 Subject: [PATCH 5/7] Refactor Dockerfile and clean up main.py - Enhanced Dockerfile for platform-specific installations - Added ARG for TARGETPLATFORM and BUILDPLATFORM - Improved GPU support conditional on TARGETPLATFORM - Removed static pages mounting in main.py - Streamlined code structure to improve maintainability --- Dockerfile | 25 ++++++++++++++++--------- main.py | 4 ---- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd71deae..2997590a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,9 @@ # syntax=docker/dockerfile:1.4 -# Build arguments +ARG TARGETPLATFORM +ARG BUILDPLATFORM + +# Other build arguments ARG PYTHON_VERSION=3.10 # Base stage with system dependencies @@ -63,13 +66,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ && rm -rf /var/lib/apt/lists/* # GPU support if enabled and architecture is supported -RUN if [ "$ENABLE_GPU" = "true" ] && [ "$(dpkg --print-architecture)" != "arm64" ] ; then \ - apt-get update && apt-get install -y --no-install-recommends \ - nvidia-cuda-toolkit \ - && rm -rf /var/lib/apt/lists/* ; \ - else \ - echo "Skipping NVIDIA CUDA Toolkit installation (unsupported architecture or GPU disabled)"; \ - fi +RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \ + apt-get update && apt-get install -y --no-install-recommends \ + nvidia-cuda-toolkit \ + && rm -rf /var/lib/apt/lists/* ; \ +else \ + echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ +fi # Create and set working directory WORKDIR /app @@ -120,7 +123,11 @@ RUN pip install --no-cache-dir \ RUN mkdocs build # Install Playwright and browsers -RUN playwright install +RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \ + playwright install chromium; \ + elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ + playwright install chromium; \ + fi # Expose port EXPOSE 8000 11235 9222 8080 diff --git a/main.py b/main.py index 6d217410..d6c792e8 100644 --- a/main.py +++ b/main.py @@ -340,9 +340,6 @@ app.add_middleware( allow_headers=["*"], # Allows all headers ) -# Mount the pages directory as a static directory -app.mount("/pages", StaticFiles(directory=__location__ + "/pages"), name="pages") - # API token security 
security = HTTPBearer() CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code" @@ -364,7 +361,6 @@ if os.path.exists(__location__ + "/site"): app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs") site_templates = Jinja2Templates(directory=__location__ + "/site") -templates = Jinja2Templates(directory=__location__ + "/pages") crawler_service = CrawlerService() From f9c98a377dd1dda28f88cd5ab4e801535a88abcc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:52:51 +0800 Subject: [PATCH 6/7] Enhance Docker support and improve installation process - Added new Docker commands for platform-specific builds. - Updated README with comprehensive installation and setup instructions. - Introduced `post_install` method in setup script for automation. - Refined migration processes with enhanced error logging. - Bump version to 0.3.746 and updated dependencies. --- CHANGELOG.md | 59 +++++++++++ README.md | 177 +++++++++++++++++++++++++++----- docker-compose.yml | 65 ++++++------ docs/examples/docker_example.py | 22 ++-- 4 files changed, 256 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5ec79639..309218dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,64 @@ # Changelog +## [0.3.746] November 29, 2024 + +### Major Features +1. Enhanced Docker Support (Nov 29, 2024) + - Improved GPU support in Docker images. + - Dockerfile refactored for better platform-specific installations. + - Introduced new Docker commands for different platforms: + - `basic-amd64`, `all-amd64`, `gpu-amd64` for AMD64. + - `basic-arm64`, `all-arm64`, `gpu-arm64` for ARM64. + +### Infrastructure & Documentation +- Enhanced README.md to improve user guidance and installation instructions. +- Added installation instructions for Playwright setup in README. +- Created and updated examples in `docs/examples/quickstart_async.py` to be more useful and user-friendly. +- Updated `requirements.txt` with a new `pydantic` dependency. +- Bumped version number in `crawl4ai/__version__.py` to 0.3.746. + +### Breaking Changes +- Streamlined application structure: + - Removed static pages and related code from `main.py` which might affect existing deployments relying on static content. + +### Development Updates +- Developed `post_install` method in `crawl4ai/install.py` to streamline post-installation setup tasks. +- Refined migration processes in `crawl4ai/migrations.py` with enhanced logging for better error visibility. +- Updated `docker-compose.yml` to support local and hub services for different architectures, enhancing build and deploy capabilities. +- Refactored example test cases in `docs/examples/docker_example.py` to facilitate comprehensive testing. + +### README.md +Updated README with new docker commands and setup instructions. +Enhanced installation instructions and guidance. + +### crawl4ai/install.py +Added post-install script functionality. +Introduced `post_install` method for automation of post-installation tasks. + +### crawl4ai/migrations.py +Improved migration logging. +Refined migration processes and added better logging. + +### docker-compose.yml +Refactored docker-compose for better service management. +Updated to define services for different platforms and versions. + +### requirements.txt +Updated dependencies. +Added `pydantic` to requirements file. + +### crawler/__version__.py +Updated version number. +Bumped version number to 0.3.746. + +### docs/examples/quickstart_async.py +Enhanced example scripts. 
+Uncommented example usage in the async quickstart guide.
+
+### main.py
+Refactored code to improve maintainability.
+Streamlined app structure by removing static pages code.
+
 ## [0.3.743] November 27, 2024
 
 Enhance features and documentation
 
diff --git a/README.md b/README.md
index bbfa5858..3d89ee19 100644
--- a/README.md
+++ b/README.md
@@ -220,48 +220,173 @@ Crawl4AI is available as Docker images for easy deployment. You can either pull
 
 ---
 
-### Option 1: Docker Hub (Recommended)
+🐳 Option 1: Docker Hub (Recommended) +Choose the appropriate image based on your platform and needs: + +### For AMD64 (Regular Linux/Windows): ```bash -# Pull and run from Docker Hub (choose one): -docker pull unclecode/crawl4ai:basic # Basic crawling features -docker pull unclecode/crawl4ai:all # Full installation (ML, LLM support) -docker pull unclecode/crawl4ai:gpu # GPU-enabled version +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-amd64 -# Run the container -docker run -p 11235:11235 unclecode/crawl4ai:basic # Replace 'basic' with your chosen version +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:all-amd64 -# In case you want to set platform to arm64 -docker run --platform linux/arm64 -p 11235:11235 unclecode/crawl4ai:basic - -# In case to allocate more shared memory for the container -docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic +# With GPU support +docker pull unclecode/crawl4ai:gpu-amd64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-amd64 ``` ---- +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Basic version (recommended) +docker pull unclecode/crawl4ai:basic-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:basic-arm64 -### Option 2: Build from Repository +# Full ML/LLM support +docker pull unclecode/crawl4ai:all-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:all-arm64 + +# With GPU support +docker pull unclecode/crawl4ai:gpu-arm64 +docker run -p 11235:11235 unclecode/crawl4ai:gpu-arm64 +``` + +Need more memory? Add `--shm-size`: +```bash +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-amd64 +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +### For Raspberry Pi (32-bit) (Experimental) +```bash +# Pull and run basic version (recommended for Raspberry Pi) +docker pull unclecode/crawl4ai:basic-armv7 +docker run -p 11235:11235 unclecode/crawl4ai:basic-armv7 + +# With increased shared memory if needed +docker run --shm-size=2gb -p 11235:11235 unclecode/crawl4ai:basic-armv7 +``` + +Note: Due to hardware constraints, only the basic version is recommended for Raspberry Pi. + +
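Whichever image you pull, the service can take tens of seconds to become ready (the compose file later in this series allows a 40-second health-check start period), so scripts should wait on the health endpoint rather than call it once. A minimal Python wait loop, assuming the default `11235` port mapping shown above:

```python
import time

import requests


def wait_for_service(url: str = "http://localhost:11235/health", timeout: float = 60.0) -> bool:
    """Poll the health endpoint until the container reports healthy or we give up."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=5).ok:
                return True
        except requests.ConnectionError:
            pass  # container still starting up
        time.sleep(2)
    return False


if __name__ == "__main__":
    print("ready" if wait_for_service() else "service did not become healthy in time")
```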
+ +
+🐳 Option 2: Build from Repository + +Build the image locally based on your platform: ```bash # Clone the repository git clone https://github.com/unclecode/crawl4ai.git cd crawl4ai -# Build the image -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all +# For AMD64 (Regular Linux/Windows) +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . -# In case you want to set platform to arm64 -docker build -t crawl4ai:local \ - --build-arg INSTALL_TYPE=basic \ # Options: basic, all - --platform linux/arm64 \ +# For ARM64 (M1/M2 Macs, ARM servers) +docker build --platform linux/arm64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=basic \ . - -# Run your local build -docker run -p 11235:11235 crawl4ai:local ``` +Build options: +- INSTALL_TYPE=basic (default): Basic crawling features +- INSTALL_TYPE=all: Full ML/LLM support +- ENABLE_GPU=true: Add GPU support + +Example with all options: +```bash +docker build --platform linux/amd64 \ + --tag crawl4ai:local \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=true \ + . +``` + +Run your local build: +```bash +# Regular run +docker run -p 11235:11235 crawl4ai:local + +# With increased shared memory +docker run --shm-size=2gb -p 11235:11235 crawl4ai:local +``` + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
+ +
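With a running container (pulled or built locally), the Quick Test further down submits a job and notes that you should keep polling `/task/{task_id}` until `status` is `"completed"`, but it shows only a single GET. A sketch of the full loop — the request payload here is illustrative, not the endpoint's exact schema:

```python
import time

import requests

BASE = "http://localhost:11235"

# Submit a crawl job; the payload shape is an assumption for illustration.
task_id = requests.post(
    f"{BASE}/crawl",
    json={"urls": "https://example.com", "priority": 10},
).json()["task_id"]

# Poll until the task reports status == "completed" (bounded at ~2 minutes).
for _ in range(60):
    task = requests.get(f"{BASE}/task/{task_id}").json()
    if task.get("status") == "completed":
        print(task)  # full task payload, including the crawl result
        break
    time.sleep(2)
```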
+🐳 Option 3: Using Docker Compose + +Docker Compose provides a more structured way to run Crawl4AI, especially when dealing with environment variables and multiple configurations. + +```bash +# Clone the repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +``` + +### For AMD64 (Regular Linux/Windows): +```bash +# Build and run locally +docker-compose --profile local-amd64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-amd64 up # Basic version +VERSION=all docker-compose --profile hub-amd64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-amd64 up # GPU support +``` + +### For ARM64 (M1/M2 Macs, ARM servers): +```bash +# Build and run locally +docker-compose --profile local-arm64 up + +# Run from Docker Hub +VERSION=basic docker-compose --profile hub-arm64 up # Basic version +VERSION=all docker-compose --profile hub-arm64 up # Full ML/LLM support +VERSION=gpu docker-compose --profile hub-arm64 up # GPU support +``` + +Environment variables (optional): +```bash +# Create a .env file +CRAWL4AI_API_TOKEN=your_token +OPENAI_API_KEY=your_openai_key +CLAUDE_API_KEY=your_claude_key +``` + +The compose file includes: +- Memory management (4GB limit, 1GB reserved) +- Shared memory volume for browser support +- Health checks +- Auto-restart policy +- All necessary port mappings + +Test the installation: +```bash +curl http://localhost:11235/health +``` + +
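The `.env` variables matter because `main.py` (patch 5) reads `CRAWL4AI_API_TOKEN` and falls back to `test_api_code` when it is unset. A sketch of sending the token as a Bearer header — whether every endpoint enforces it is not visible in this diff:

```python
import os

import requests

# Mirrors the fallback in main.py: os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
token = os.getenv("CRAWL4AI_API_TOKEN", "test_api_code")

resp = requests.get(
    "http://localhost:11235/health",
    headers={"Authorization": f"Bearer {token}"},
)
print(resp.status_code, resp.text)
```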
+ --- ### Quick Test @@ -278,11 +403,11 @@ response = requests.post( ) task_id = response.json()["task_id"] -# Get results +# Continue polling until the task is complete (status="completed") result = requests.get(f"http://localhost:11235/task/{task_id}") ``` -For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). +For more examples, see our [Docker Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py). For advanced configuration, environment variables, and usage examples, see our [Docker Deployment Guide](https://crawl4ai.com/mkdocs/basic/docker-deployment/). diff --git a/docker-compose.yml b/docker-compose.yml index b93beda9..4b22fd98 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,6 @@ services: - crawl4ai: + # Local build services for different platforms + crawl4ai-amd64: build: context: . dockerfile: Dockerfile @@ -7,35 +8,39 @@ services: PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: false - profiles: ["local"] - ports: - - "11235:11235" - - "8000:8000" - - "9222:9222" - - "8080:8080" - environment: - - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} - - OPENAI_API_KEY=${OPENAI_API_KEY:-} - - CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - volumes: - - /dev/shm:/dev/shm - deploy: - resources: - limits: - memory: 4G - reservations: - memory: 1G - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:11235/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s + platforms: + - linux/amd64 + profiles: ["local-amd64"] + extends: &base-config + file: docker-compose.yml + service: base-config - crawl4ai-hub: - image: unclecode/crawl4ai:basic - profiles: ["hub"] + crawl4ai-arm64: + build: + context: . 
+ dockerfile: Dockerfile + args: + PYTHON_VERSION: "3.10" + INSTALL_TYPE: ${INSTALL_TYPE:-basic} + ENABLE_GPU: false + platforms: + - linux/arm64 + profiles: ["local-arm64"] + extends: *base-config + + # Hub services for different platforms and versions + crawl4ai-hub-amd64: + image: unclecode/crawl4ai:${VERSION:-basic}-amd64 + profiles: ["hub-amd64"] + extends: *base-config + + crawl4ai-hub-arm64: + image: unclecode/crawl4ai:${VERSION:-basic}-arm64 + profiles: ["hub-arm64"] + extends: *base-config + + # Base configuration to be extended + base-config: ports: - "11235:11235" - "8000:8000" @@ -59,4 +64,4 @@ services: interval: 30s timeout: 10s retries: 3 - start_period: 40s + start_period: 40s \ No newline at end of file diff --git a/docs/examples/docker_example.py b/docs/examples/docker_example.py index 17ef9f04..48acc809 100644 --- a/docs/examples/docker_example.py +++ b/docs/examples/docker_example.py @@ -78,20 +78,20 @@ def test_docker_deployment(version="basic"): time.sleep(5) # Test cases based on version - # test_basic_crawl(tester) - # test_basic_crawl(tester) - # test_basic_crawl_sync(tester) test_basic_crawl_direct(tester) + test_basic_crawl(tester) + test_basic_crawl(tester) + test_basic_crawl_sync(tester) - # if version in ["full", "transformer"]: - # test_cosine_extraction(tester) + if version in ["full", "transformer"]: + test_cosine_extraction(tester) - # test_js_execution(tester) - # test_css_selector(tester) - # test_structured_extraction(tester) - # test_llm_extraction(tester) - # test_llm_with_ollama(tester) - # test_screenshot(tester) + test_js_execution(tester) + test_css_selector(tester) + test_structured_extraction(tester) + test_llm_extraction(tester) + test_llm_with_ollama(tester) + test_screenshot(tester) def test_basic_crawl(tester: Crawl4AiTester): From 1def53b7fe60267d5bc1f492f50b5f53f8858eee Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 29 Nov 2024 20:53:43 +0800 Subject: [PATCH 7/7] docs: update Raspberry Pi section to indicate upcoming support --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3d89ee19..405c1002 100644 --- a/README.md +++ b/README.md @@ -265,7 +265,7 @@ Test the installation: curl http://localhost:11235/health ``` -### For Raspberry Pi (32-bit) (Experimental) +### For Raspberry Pi (32-bit) (coming soon): ```bash # Pull and run basic version (recommended for Raspberry Pi) docker pull unclecode/crawl4ai:basic-armv7
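A thread running through the quickstart changes in this series is the replacement of `bypass_cache=True` with the `CacheMode` enum. A minimal end-to-end sketch of the new calling convention, using only names that appear in these patches:

```python
import asyncio

from crawl4ai import AsyncWebCrawler, CacheMode


async def main():
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://www.example.com",
            cache_mode=CacheMode.BYPASS,  # replaces the removed bypass_cache=True flag
        )
        if result.success:
            print(result.markdown[:500])


asyncio.run(main())
```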