Migrating from the classic setup.py to a pyproject.toml-based approach.

2024-11-29 14:45:04 +08:00
parent c0e87abaee
commit 449dd7cc0b
9 changed files with 229 additions and 205 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +0,0 @@
 include requirements.txt
--- a/build_hooks.py
+++ b/build_hooks.py
@@ -0,0 +1,48 @@
 import os
 import shutil
 from pathlib import Path
 import subprocess
 import sys
 from hatchling.builders.hooks.plugin.interface import BuildHookInterface
 PLUGIN = "CustomBuildHook" 
 class CustomBuildHook(BuildHookInterface):
    """Build hook that prepares the local Crawl4AI environment.

    When ``initialize`` runs it creates the ``.crawl4ai`` folder structure,
    installs the Playwright browsers and initializes the database.
    """

    def initialize(self, version, build_data):
        """Create the ``.crawl4ai`` folders, then set up Playwright and the database.

        Errors while creating the folders propagate to the caller. The
        Playwright and database steps are best-effort: on failure they only
        print a warning.

        Args:
            version: Not used by this hook.
            build_data: Not used by this hook.
        """
        # Create the .crawl4ai folder structure, rooted at
        # CRAWL4_AI_BASE_DIRECTORY when set, otherwise at the home directory.
        base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
        crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
        crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
        cache_folder = crawl4ai_folder / "cache"
        content_folders = [
            "html_content",
            "cleaned_html",
            "markdown_content",
            "extracted_content",
            "screenshots",
        ]

        # Clean up old cache if exists
        if cache_folder.exists():
            shutil.rmtree(cache_folder)

        # Create new folder structure.
        # parents=True so that a CRAWL4_AI_BASE_DIRECTORY which does not exist
        # yet is created instead of raising FileNotFoundError.
        crawl4ai_folder.mkdir(parents=True, exist_ok=True)
        cache_folder.mkdir(exist_ok=True)
        for folder in content_folders:
            (crawl4ai_folder / folder).mkdir(exist_ok=True)

        # Install Playwright browsers (best-effort)
        try:
            subprocess.check_call([sys.executable, "-m", "playwright", "install"])
        except Exception as e:
            print(f"Warning: Playwright installation failed: {e}")
            print("Please run 'python -m playwright install' manually after installation")

        # Initialize database (best-effort). The imports are kept local so a
        # missing crawl4ai package ends up in the warning path below.
        try:
            from crawl4ai.async_database import async_db_manager
            import asyncio
            asyncio.run(async_db_manager.initialize())
        except Exception as e:
            print(f"Warning: Database initialization failed: {e}")
            print("Database will be initialized on first use")
--- a/docs/examples/quickstart_async.py
+++ b/docs/examples/quickstart_async.py
@@ -32,7 +32,7 @@ print("Website: https://crawl4ai.com")
 async def simple_crawl():
    print("\n--- Basic Usage ---")
    async with AsyncWebCrawler(verbose=True) as crawler:
-        result = await crawler.arun(url="https://www.nbcnews.com/business")
+        result = await crawler.arun(url="https://www.nbcnews.com/business", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])  # Print first 500 characters
 async def simple_example_with_running_js_code():
@@ -76,16 +76,17 @@ async def use_proxy():
    async with AsyncWebCrawler(verbose=True, proxy="http://your-proxy-url:port") as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
-            bypass_cache=True
+            cache_mode= CacheMode.BYPASS
        )
-        print(result.markdown[:500])  # Print first 500 characters
+        if result.success:
            print(result.markdown[:500])  # Print first 500 characters
 async def capture_and_save_screenshot(url: str, output_path: str):
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
-            bypass_cache=True
+            cache_mode= CacheMode.BYPASS
        )
        if result.success and result.screenshot:
@@ -141,41 +142,68 @@ async def extract_structured_data_using_llm(provider: str, api_token: str = None
 async def extract_structured_data_using_css_extractor():
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
-        "name": "Coinbase Crypto Prices",
+    "name": "KidoCode Courses",
-        "baseSelector": ".cds-tableRow-t45thuk",
+    "baseSelector": "section.charge-methodology .w-tab-content > div",
-        "fields": [
+    "fields": [
-            {
+        {
-                "name": "crypto",
+            "name": "section_title",
-                "selector": "td:nth-child(1) h2",
+            "selector": "h3.heading-50",
-                "type": "text",
+            "type": "text",
-            },
+        },
-            {
+        {
-                "name": "symbol",
+            "name": "section_description",
-                "selector": "td:nth-child(1) p",
+            "selector": ".charge-content",
-                "type": "text",
+            "type": "text",
-            },
+        },
-            {
+        {
-                "name": "price",
+            "name": "course_name",
-                "selector": "td:nth-child(2)",
+            "selector": ".text-block-93",
-                "type": "text",
+            "type": "text",
        },
        {
            "name": "course_description",
            "selector": ".course-content-text",
            "type": "text",
        },
        {
            "name": "course_icon",
            "selector": ".image-92",
            "type": "attribute",
            "attribute": "src"
        }
    ]
 }
    async with AsyncWebCrawler(
        headless=True,
        verbose=True
    ) as crawler:
        # Create the JavaScript that handles clicking multiple times
        js_click_tabs = """
        (async () => {
            const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
            for(let tab of tabs) {
                // scroll to the tab
                tab.scrollIntoView();
                tab.click();
                // Wait for content to load and animations to complete
                await new Promise(r => setTimeout(r, 500));
            }
-        ],
+        })();
-    }
+        """     
    extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
-            url="https://www.coinbase.com/explore",
+            url="https://www.kidocode.com/degrees/technology",
-            extraction_strategy=extraction_strategy,
+            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
-            cache_mode=CacheMode.BYPASS,
+            js_code=[js_click_tabs],
            cache_mode=CacheMode.BYPASS
        )
-        assert result.success, "Failed to crawl the page"
+        companies = json.loads(result.extracted_content)
-
+        print(f"Successfully extracted {len(companies)} companies")
-        news_teasers = json.loads(result.extracted_content)
+        print(json.dumps(companies[0], indent=2))
        print(f"Successfully extracted {len(news_teasers)} news teasers")
        print(json.dumps(news_teasers[0], indent=2))
 # Advanced Session-Based Crawling with Dynamic Content 🔄
 async def crawl_dynamic_content_pages_method_1():
@@ -363,21 +391,21 @@ async def crawl_custom_browser_type():
    # Use Firefox
    start = time.time()
    async with AsyncWebCrawler(browser_type="firefox", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)
    # Use WebKit
    start = time.time()
    async with AsyncWebCrawler(browser_type="webkit", verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)
    # Use Chromium (default)
    start = time.time()
    async with AsyncWebCrawler(verbose=True, headless = True) as crawler:
-        result = await crawler.arun(url="https://www.example.com", bypass_cache=True)
+        result = await crawler.arun(url="https://www.example.com", cache_mode= CacheMode.BYPASS)
        print(result.markdown[:500])
        print("Time taken: ", time.time() - start)
@@ -534,29 +562,29 @@ async def fit_markdown_remove_overlay():
 async def main():
-    await simple_crawl()
+    # await simple_crawl()
-    await simple_example_with_running_js_code()
+    # await simple_example_with_running_js_code()
-    await simple_example_with_css_selector()
+    # await simple_example_with_css_selector()
-    await use_proxy()
+    # await use_proxy()
-    await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
+    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
-    await extract_structured_data_using_css_extractor()
+    # await extract_structured_data_using_css_extractor()
    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
    # await extract_structured_data_using_llm("ollama/llama3.2")    
-    await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
+    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    # You always can pass custom headers to the extraction strategy
-    custom_headers = {
+    # custom_headers = {
-        "Authorization": "Bearer your-custom-token",
+    #     "Authorization": "Bearer your-custom-token",
-        "X-Custom-Header": "Some-Value"
+    #     "X-Custom-Header": "Some-Value"
-    }
+    # }
-    await extract_structured_data_using_llm(extra_headers=custom_headers)
+    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
-    await crawl_dynamic_content_pages_method_3()
+    # await crawl_dynamic_content_pages_method_3()
    await crawl_custom_browser_type()
--- a/plugin.py
+++ b/plugin.py
@@ -0,0 +1,9 @@
 from colorama import Fore, Style
 import subprocess
 import sys
 def post_install():
    """Print a colored banner reminding the user to install Playwright browsers."""
    separator = "=" * 40
    banner_lines = (
        f"\n{Fore.YELLOW}{separator}",
        f"{Fore.RED}IMPORTANT: Run this command now:",
        f"{Fore.GREEN}python -m playwright install",
        f"{Fore.YELLOW}{separator}{Style.RESET_ALL}\n",
    )
    for banner_line in banner_lines:
        print(banner_line)
--- a/post_install.py
+++ b/post_install.py
@@ -0,0 +1,19 @@
 from colorama import Fore, Style
 import subprocess
 import sys
 import distutils.log as log
 from pathlib import Path
 def main():
    """Install the Playwright browsers, or tell the user how to do it manually.

    Runs ``python -m playwright install`` with stdout and stderr discarded.
    If the command fails or cannot be started, a colored reminder is printed
    instead of raising.
    """
    try:
        subprocess.check_call(
            [sys.executable, "-m", "playwright", "install"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by this best-effort fallback.
        print(f"\n{Fore.YELLOW}{'='*40}")
        print(f"{Fore.RED}IMPORTANT: Run this command now:")
        print(f"{Fore.GREEN}python -m playwright install")
        print(f"{Fore.YELLOW}{'='*40}{Style.RESET_ALL}\n")
 # Run the post-install step when this file is executed directly as a script.
 if __name__ == "__main__":
    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,75 @@
 [build-system]
 requires = ["hatchling", "hatch-fancy-pypi-readme>=22.5.0"]
 build-backend = "hatchling.build"
 [project]
 name = "Crawl4AI"
 dynamic = ["version"]
 description = "🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
 readme = "README.md"
 license = "Apache-2.0"
 requires-python = ">=3.7"
 authors = [
    { name = "Unclecode", email = "unclecode@kidocode.com" },
 ]
 classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "License :: OSI Approved :: Apache Software License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.7",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
 ]
 dependencies = [
    "aiosqlite~=0.20",
    "html2text~=2024.2",
    "lxml~=5.3",
    "litellm>=1.53.1",
    "numpy>=1.26.0,<3",
    "pillow~=10.4",
    "playwright>=1.49.0",
    "python-dotenv~=1.0",
    "requests~=2.26",
    "beautifulsoup4~=4.12",
    "tf-playwright-stealth>=1.1.0",
    "xxhash~=3.4",
    "rank-bm25~=0.2", 
    "aiofiles>=24.1.0",
    "colorama~=0.4",
    "snowballstemmer~=2.2",
 ]
 [project.optional-dependencies]
 torch = ["torch", "nltk", "scikit-learn"]
 transformer = ["transformers", "tokenizers"]
 cosine = ["torch", "transformers", "nltk"]
 sync = ["selenium"]
 all = [
    "torch",
    "nltk",
    "scikit-learn",
    "transformers",
    "tokenizers",
    "selenium",
 ]
 [project.urls]
 Homepage = "https://github.com/unclecode/crawl4ai"
 Documentation = "https://crawl4ai.com/mkdocs/"
 [project.scripts]
 crawl4ai-download-models = "crawl4ai.model_loader:main"
 crawl4ai-migrate = "crawl4ai.migrations:main"
 crawl4ai-post-install = "crawl4ai.post_install:main"
 [tool.hatch.version]
 path = "crawl4ai/__version__.py"
 [tool.hatch.build.hooks.custom]
 dependencies = ["hatch-fancy-pypi-readme>=22.5.0"]
 path = "build_hooks.py"
 [project.entry-points.hatch]
 crawl4ai = "crawl4ai.plugin:post_install"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,16 +0,0 @@
 aiosqlite~=0.20
 html2text~=2024.2
 lxml~=5.3
 litellm>=1.53.1
 numpy>=1.26.0,<3
 pillow~=10.4
 playwright>=1.49.0
 python-dotenv~=1.0
 requests~=2.26
 beautifulsoup4~=4.12
 tf-playwright-stealth>=1.1.0
 xxhash~=3.4
 rank-bm25~=0.2
 aiofiles>=24.1.0
 colorama~=0.4
 snowballstemmer~=2.2
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +0,0 @@
 [options]
 include_package_data = True
--- a/setup.py
+++ b/setup.py
@@ -1,136 +0,0 @@
 from setuptools import setup, find_packages
 from setuptools.command.install import install
 import os
 from pathlib import Path
 import shutil
 import subprocess
 import sys
 import asyncio
 # Create the .crawl4ai folder if it doesn't exist, under CRAWL4_AI_BASE_DIRECTORY
 # when that variable is set, otherwise under the user's home directory.
 # If the folder already exists, remove the cache folder
 base_dir = os.getenv("CRAWL4_AI_BASE_DIRECTORY")
 crawl4ai_folder = Path(base_dir) if base_dir else Path.home()
 crawl4ai_folder = crawl4ai_folder / ".crawl4ai"
 cache_folder = crawl4ai_folder / "cache"
 content_folders = [
    "html_content",
    "cleaned_html",
    "markdown_content",
    "extracted_content",
    "screenshots",
 ]
 # Clean up old cache if exists
 if cache_folder.exists():
    shutil.rmtree(cache_folder)
 # Create new folder structure.
 # parents=True so that a CRAWL4_AI_BASE_DIRECTORY which does not exist yet is
 # created instead of raising FileNotFoundError.
 crawl4ai_folder.mkdir(parents=True, exist_ok=True)
 cache_folder.mkdir(exist_ok=True)
 for folder in content_folders:
    (crawl4ai_folder / folder).mkdir(exist_ok=True)
 # Read requirements and version
 __location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
 with open(os.path.join(__location__, "requirements.txt")) as f:
    requirements = f.read().splitlines()
 # Start from None so a missing __version__ line is reported clearly below
 # instead of surfacing later as a NameError.
 version = None
 with open("crawl4ai/__version__.py") as f:
    for line in f:
        if line.startswith("__version__"):
            # Strip either quote style around the version string.
            version = line.split("=", 1)[1].strip().strip("\"'")
            break
 if version is None:
    raise RuntimeError("Unable to find __version__ in crawl4ai/__version__.py")
 # Define requirements
 default_requirements = requirements
 torch_requirements = ["torch", "nltk", "scikit-learn"]
 transformer_requirements = ["transformers", "tokenizers"]
 cosine_similarity_requirements = ["torch", "transformers", "nltk"]
 sync_requirements = ["selenium"]
 def install_playwright():
    """Run ``python -m playwright install`` and report the outcome on stdout.

    Failures are never raised; a hint to run the command manually is printed
    instead.
    """
    manual_hint = "Please run 'python -m playwright install' manually after the installation."
    command = [sys.executable, "-m", "playwright", "install"]
    print("Installing Playwright browsers...")
    try:
        subprocess.check_call(command)
        print("Playwright installation completed successfully.")
    except subprocess.CalledProcessError as err:
        # The command ran but exited with a non-zero status.
        print(f"Error during Playwright installation: {err}")
        print(manual_hint)
    except Exception as err:
        # Anything else, e.g. the command could not be started at all.
        print(f"Unexpected error during Playwright installation: {err}")
        print(manual_hint)
 def run_migration():
    """Initialize the database at install time, warning instead of failing."""
    try:
        print("Starting database initialization...")
        # Imported lazily so a missing crawl4ai package lands in the
        # ImportError branch below.
        from crawl4ai.async_database import async_db_manager as db_manager

        init_coro = db_manager.initialize()
        asyncio.run(init_coro)
        print("Database initialization completed successfully.")
    except ImportError:
        print("Warning: Database module not found. Will initialize on first use.")
    except Exception as err:
        print(f"Warning: Database initialization failed: {err}")
        print("Database will be initialized on first use")
 class PostInstallCommand(install):
    """Install command that runs install_playwright() after the normal install."""
    def run(self):
        # Perform the regular install first, then the Playwright step.
        install.run(self)
        install_playwright()
        # Database initialization at install time is currently disabled.
        # run_migration()
 # Package metadata and install configuration.
 setup(
    name="Crawl4AI",
    version=version,
    description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper",
    # read_text() closes the file; the previous open(...).read() left the
    # handle open.
    long_description=Path("README.md").read_text(encoding="utf-8"),
    long_description_content_type="text/markdown",
    url="https://github.com/unclecode/crawl4ai",
    author="Unclecode",
    author_email="unclecode@kidocode.com",
    # Matches the Apache Software License classifier declared below.
    license="Apache-2.0",
    packages=find_packages(),
    install_requires=default_requirements
    + ["playwright", "aiofiles"],  # Added aiofiles
    extras_require={
        "torch": torch_requirements,
        "transformer": transformer_requirements,
        "cosine": cosine_similarity_requirements,
        "sync": sync_requirements,
        "all": default_requirements
        + torch_requirements
        + transformer_requirements
        + cosine_similarity_requirements
        + sync_requirements,
    },
    entry_points={
        "console_scripts": [
            "crawl4ai-download-models=crawl4ai.model_loader:main",
            "crawl4ai-migrate=crawl4ai.migrations:main",  # Added migration command
        ],
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: Apache Software License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
    ],
    python_requires=">=3.7",
    # Use the install command that also runs the Playwright step.
    cmdclass={
        "install": PostInstallCommand,
    },
 )