diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 540b7204..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1 +0,0 @@ -include requirements.txt \ No newline at end of file diff --git a/build_hooks.py b/build_hooks.py new file mode 100644 index 00000000..e59b5910 --- /dev/null +++ b/build_hooks.py @@ -0,0 +1,48 @@ +import os +import shutil +from pathlib import Path +import subprocess +import sys +from hatchling.builders.hooks.plugin.interface import BuildHookInterface +PLUGIN = "CustomBuildHook" + +class CustomBuildHook(BuildHookInterface): + def initialize(self, version, build_data): + # Create the .crawl4ai folder structure + base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") + crawl4ai_folder = Path(base_dir) if base_dir else Path.home() + crawl4ai_folder = crawl4ai_folder / ".crawl4ai" + cache_folder = crawl4ai_folder / "cache" + content_folders = [ + "html_content", + "cleaned_html", + "markdown_content", + "extracted_content", + "screenshots", + ] + + # Clean up old cache if exists + if cache_folder.exists(): + shutil.rmtree(cache_folder) + + # Create new folder structure + crawl4ai_folder.mkdir(exist_ok=True) + cache_folder.mkdir(exist_ok=True) + for folder in content_folders: + (crawl4ai_folder / folder).mkdir(exist_ok=True) + + # Install Playwright browsers + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"]) + except Exception as e: + print(f"Warning: Playwright installation failed: {e}") + print("Please run 'python -m playwright install' manually after installation") + + # Initialize database + try: + from crawl4ai.async_database import async_db_manager + import asyncio + asyncio.run(async_db_manager.initialize()) + except Exception as e: + print(f"Warning: Database initialization failed: {e}") + print("Database will be initialized on first use") \ No newline at end of file diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py index 9f1eff53..01f7677c 100644 --- 
a/docs/examples/quickstart_async.py +++ b/docs/examples/quickstart_async.py @@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com") async def simple_crawl(): print("\n--- Basic Usage ---") async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun(url="https://www.nbcnews.com/business") + result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) # Print first 500 characters async def simple_example_with_running_js_code(): @@ -76,16 +76,17 @@ async def use_proxy(): async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler: result = await crawler.arun( url="https://www.nbcnews.com/business", - bypass_cache=True + cache_mode= CacheMode.BYPASS ) - print(result.markdown[:500]) # Print first 500 characters + if result.success: + print(result.markdown[:500]) # Print first 500 characters async def capture_and_save_screenshot(url: str, output_path: str): async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( url=url, screenshot=True, - bypass_cache=True + cache_mode= CacheMode.BYPASS ) if result.success and result.screenshot: @@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") schema = { - "name": "Coinbase Crypto Prices", - "baseSelector": ".cds-tableRow-t45thuk", - "fields": [ - { - "name": "crypto", - "selector": "td:nth-child(1) h2", - "type": "text", - }, - { - "name": "symbol", - "selector": "td:nth-child(1) p", - "type": "text", - }, - { - "name": "price", - "selector": "td:nth-child(2)", - "type": "text", + "name": "KidoCode Courses", + "baseSelector": "section.charge-methodology .w-tab-content > div", + "fields": [ + { + "name": "section_title", + "selector": "h3.heading-50", + "type": "text", + }, + { + "name": "section_description", 
+ "selector": ".charge-content", + "type": "text", + }, + { + "name": "course_name", + "selector": ".text-block-93", + "type": "text", + }, + { + "name": "course_description", + "selector": ".course-content-text", + "type": "text", + }, + { + "name": "course_icon", + "selector": ".image-92", + "type": "attribute", + "attribute": "src" + } + ] +} + + async with AsyncWebCrawler( + headless=True, + verbose=True + ) as crawler: + + # Create the JavaScript that handles clicking multiple times + js_click_tabs = """ + (async () => { + const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div"); + + for(let tab of tabs) { + // scroll to the tab + tab.scrollIntoView(); + tab.click(); + // Wait for content to load and animations to complete + await new Promise(r => setTimeout(r, 500)); } - ], - } + })(); + """ - extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True) - - async with AsyncWebCrawler(verbose=True) as crawler: result = await crawler.arun( - url="https://www.coinbase.com/explore", - extraction_strategy=extraction_strategy, - cache_mode=CacheMode.BYPASS, + url="https://www.kidocode.com/degrees/technology", + extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True), + js_code=[js_click_tabs], + cache_mode=CacheMode.BYPASS ) - assert result.success, "Failed to crawl the page" - - news_teasers = json.loads(result.extracted_content) - print(f"Successfully extracted {len(news_teasers)} news teasers") - print(json.dumps(news_teasers[0], indent=2)) + companies = json.loads(result.extracted_content) + print(f"Successfully extracted {len(companies)} companies") + print(json.dumps(companies[0], indent=2)) # Advanced Session-Based Crawling with Dynamic Content 🔄 async def crawl_dynamic_content_pages_method_1(): @@ -363,21 +391,21 @@ async def crawl_custom_browser_type(): # Use Firefox start = time.time() async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler: - result = await 
crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use WebKit start = time.time() async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) # Use Chromium (default) start = time.time() async with AsyncWebCrawler(verbose=True, headless = True) as crawler: - result = await crawler.arun(url="https://www.example.com", bypass_cache=True) + result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS) print(result.markdown[:500]) print("Time taken: ", time.time() - start) @@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay(): async def main(): - await simple_crawl() - await simple_example_with_running_js_code() - await simple_example_with_css_selector() - await use_proxy() - await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) - await extract_structured_data_using_css_extractor() + # await simple_crawl() + # await simple_example_with_running_js_code() + # await simple_example_with_css_selector() + # await use_proxy() + # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg")) + # await extract_structured_data_using_css_extractor() # LLM extraction examples # await extract_structured_data_using_llm() # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY")) # await extract_structured_data_using_llm("ollama/llama3.2") - await extract_structured_data_using_llm("openai/gpt-4o", 
os.getenv("OPENAI_API_KEY")) + # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")) # You always can pass custom headers to the extraction strategy - custom_headers = { - "Authorization": "Bearer your-custom-token", - "X-Custom-Header": "Some-Value" - } - await extract_structured_data_using_llm(extra_headers=custom_headers) + # custom_headers = { + # "Authorization": "Bearer your-custom-token", + # "X-Custom-Header": "Some-Value" + # } + # await extract_structured_data_using_llm(extra_headers=custom_headers) # await crawl_dynamic_content_pages_method_1() # await crawl_dynamic_content_pages_method_2() - await crawl_dynamic_content_pages_method_3() + # await crawl_dynamic_content_pages_method_3() await crawl_custom_browser_type() diff --git a/plugin.py b/plugin.py new file mode 100644 index 00000000..1e1b11bf --- /dev/null +++ b/plugin.py @@ -0,0 +1,9 @@ +from colorama import Fore, Style +import subprocess +import sys + +def post_install(): + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") \ No newline at end of file diff --git a/post_install.py b/post_install.py new file mode 100644 index 00000000..e536e547 --- /dev/null +++ b/post_install.py @@ -0,0 +1,19 @@ +from colorama import Fore, Style +import subprocess +import sys +import distutils.log as log +from pathlib import Path + +def main(): + try: + subprocess.check_call([sys.executable, "-m", "playwright", "install"], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL) + except Exception: + print(f"\n{Fore.YELLOW}{'='*40}") + print(f"{Fore.RED}IMPORTANT: Run this command now:") + print(f"{Fore.GREEN}python -m playwright install") + print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..cfef8101 
--- /dev/null +++ b/pyproject.toml @@ -0,0 +1,75 @@ +[build-system] +requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"] +build-backend = "hatchling.build" + +[project] +name = "Crawl4AI" +dynamic = ["version"] +description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "Unclecode", email = "unclecode@kidocode.com" }, +] +classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "aiosqlite~=0.20", + "html2text~=2024.2", + "lxml~=5.3", + "litellm>=1.53.1", + "numpy>=1.26.0,<3", + "pillow~=10.4", + "playwright>=1.49.0", + "python-dotenv~=1.0", + "requests~=2.26", + "beautifulsoup4~=4.12", + "tf-playwright-stealth>=1.1.0", + "xxhash~=3.4", + "rank-bm25~=0.2", + "aiofiles>=24.1.0", + "colorama~=0.4", + "snowballstemmer~=2.2", +] + +[project.optional-dependencies] +torch = ["torch", "nltk", "scikit-learn"] +transformer = ["transformers", "tokenizers"] +cosine = ["torch", "transformers", "nltk"] +sync = ["selenium"] +all = [ + "torch", + "nltk", + "scikit-learn", + "transformers", + "tokenizers", + "selenium", +] + +[project.urls] +Homepage = "https://github.com/unclecode/crawl4ai" +Documentation = "https://crawl4ai.com/mkdocs/" + +[project.scripts] +crawl4ai-download-models = "crawl4ai.model_loader:main" +crawl4ai-migrate = "crawl4ai.migrations:main" +crawl4ai-post-install = "crawl4ai.post_install:main" + +[tool.hatch.version] +path = "crawl4ai/__version__.py" + +[tool.hatch.build.hooks.custom] +dependencies = ["hatch-fancy-pypi-readme>=22.5.0"] +path = "build_hooks.py" + +[project.entry-points.hatch] +crawl4ai = 
"crawl4ai.plugin:post_install" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c0f6f183..00000000 --- a/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -aiosqlite~=0.20 -html2text~=2024.2 -lxml~=5.3 -litellm>=1.53.1 -numpy>=1.26.0,<3 -pillow~=10.4 -playwright>=1.49.0 -python-dotenv~=1.0 -requests~=2.26 -beautifulsoup4~=4.12 -tf-playwright-stealth>=1.1.0 -xxhash~=3.4 -rank-bm25~=0.2 -aiofiles>=24.1.0 -colorama~=0.4 -snowballstemmer~=2.2 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 56490d6a..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[options] -include_package_data = True \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index d44169bf..00000000 --- a/setup.py +++ /dev/null @@ -1,136 +0,0 @@ -from setuptools import setup, find_packages -from setuptools.command.install import install -import os -from pathlib import Path -import shutil -import subprocess -import sys -import asyncio - -# Create the .crawl4ai folder in the user's home directory if it doesn't exist -# If the folder already exists, remove the cache folder -base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY") -crawl4ai_folder = Path(base_dir) if base_dir else Path.home() -crawl4ai_folder = crawl4ai_folder / ".crawl4ai" -cache_folder = crawl4ai_folder / "cache" -content_folders = [ - "html_content", - "cleaned_html", - "markdown_content", - "extracted_content", - "screenshots", -] - -# Clean up old cache if exists -if cache_folder.exists(): - shutil.rmtree(cache_folder) - -# Create new folder structure -crawl4ai_folder.mkdir(exist_ok=True) -cache_folder.mkdir(exist_ok=True) -for folder in content_folders: - (crawl4ai_folder / folder).mkdir(exist_ok=True) - -# Read requirements and version -__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) -with open(os.path.join(__location__, "requirements.txt")) as f: - requirements = f.read().splitlines() - -with 
open("crawl4ai/__version__.py") as f: - for line in f: - if line.startswith("__version__"): - version = line.split("=")[1].strip().strip('"') - break - -# Define requirements -default_requirements = requirements -torch_requirements = ["torch", "nltk", "scikit-learn"] -transformer_requirements = ["transformers", "tokenizers"] -cosine_similarity_requirements = ["torch", "transformers", "nltk"] -sync_requirements = ["selenium"] - - -def install_playwright(): - print("Installing Playwright browsers...") - try: - subprocess.check_call([sys.executable, "-m", "playwright", "install"]) - print("Playwright installation completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - except Exception as e: - print(f"Unexpected error during Playwright installation: {e}") - print( - "Please run 'python -m playwright install' manually after the installation." - ) - - -def run_migration(): - """Initialize database during installation""" - try: - print("Starting database initialization...") - from crawl4ai.async_database import async_db_manager - - asyncio.run(async_db_manager.initialize()) - print("Database initialization completed successfully.") - except ImportError: - print("Warning: Database module not found. 
Will initialize on first use.") - except Exception as e: - print(f"Warning: Database initialization failed: {e}") - print("Database will be initialized on first use") - - -class PostInstallCommand(install): - def run(self): - install.run(self) - install_playwright() - # run_migration() - - -setup( - name="Crawl4AI", - version=version, - description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper", - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type="text/markdown", - url="https://github.com/unclecode/crawl4ai", - author="Unclecode", - author_email="unclecode@kidocode.com", - license="MIT", - packages=find_packages(), - install_requires=default_requirements - + ["playwright", "aiofiles"], # Added aiofiles - extras_require={ - "torch": torch_requirements, - "transformer": transformer_requirements, - "cosine": cosine_similarity_requirements, - "sync": sync_requirements, - "all": default_requirements - + torch_requirements - + transformer_requirements - + cosine_similarity_requirements - + sync_requirements, - }, - entry_points={ - "console_scripts": [ - "crawl4ai-download-models=crawl4ai.model_loader:main", - "crawl4ai-migrate=crawl4ai.migrations:main", # Added migration command - ], - }, - classifiers=[ - "Development Status :: 3 - Alpha", - "Intended Audience :: Developers", - "License :: OSI Approved :: Apache Software License", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - ], - python_requires=">=3.7", - cmdclass={ - "install": PostInstallCommand, - }, -)