diff --git a/README.md b/README.md
index 6871adf6..ab38aee6 100644
--- a/README.md
+++ b/README.md
@@ -10,8 +10,14 @@ Crawl4AI has one clear task: to simplify crawling and extract useful information
 
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1wz8u30rvbq6Scodye9AGCw8Qg_Z8QGsk)
 
-## Recent Changes v0.2.0
+## Recent Changes
+### v0.2.2
+- Support multiple JS scripts
+- Fixed several bugs
+- Resolved a few issues related to Colab installation
+
+### v0.2.0
 - πŸš€ 10x faster!!
 - πŸ“œ Execute custom JavaScript before crawling!
 - 🀝 Colab friendly!
 
diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py
index a98402bc..60d5c54f 100644
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -103,12 +103,18 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
             )
 
             # Execute JS code if provided
-            if self.js_code:
+            if self.js_code and type(self.js_code) == str:
                 self.driver.execute_script(self.js_code)
                 # Optionally, wait for some condition after executing the JS code
                 WebDriverWait(self.driver, 10).until(
                     lambda driver: driver.execute_script("return document.readyState") == "complete"
                 )
+            elif self.js_code and type(self.js_code) == list:
+                for js in self.js_code:
+                    self.driver.execute_script(js)
+                    WebDriverWait(self.driver, 10).until(
+                        lambda driver: driver.execute_script("return document.readyState") == "complete"
+                    )
 
             html = self.driver.page_source
 
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 2d164ff0..a24b5fe5 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -188,14 +188,15 @@ class CosineStrategy(ExtractionStrategy):
         if self.verbose:
             print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
 
-        if False and self.device.type == "cpu":
-            self.model = load_onnx_all_MiniLM_l6_v2()
-            self.tokenizer = self.model.tokenizer
-            self.get_embedding_method = "direct"
-        else:
-            self.tokenizer, self.model = load_bge_small_en_v1_5()
-            self.model.eval()
-            self.get_embedding_method = "batch"
+        # if False and self.device.type == "cpu":
+        #     self.model = load_onnx_all_MiniLM_l6_v2()
+        #     self.tokenizer = self.model.tokenizer
+        #     self.get_embedding_method = "direct"
+        # else:
+
+        self.tokenizer, self.model = load_bge_small_en_v1_5()
+        self.model.eval()
+        self.get_embedding_method = "batch"
 
         self.buffer_embeddings = np.array([])
 
diff --git a/crawl4ai/model_loader.py b/crawl4ai/model_loader.py
index 5ff3695e..7e17f7f9 100644
--- a/crawl4ai/model_loader.py
+++ b/crawl4ai/model_loader.py
@@ -35,8 +35,7 @@ def calculate_batch_size(device):
         else:
             return 32
     else:
-        return 16 # Default batch size
-
+        return 16 # Default batch size
 
 @lru_cache()
 def get_device():
@@ -258,8 +257,8 @@ def download_all_models(remove_existing=False):
     #     load_bert_base_uncased()
     # print("[LOG] Downloading BGE Small EN v1.5...")
     # load_bge_small_en_v1_5()
-    print("[LOG] Downloading ONNX model...")
-    load_onnx_all_MiniLM_l6_v2()
+    # print("[LOG] Downloading ONNX model...")
+    # load_onnx_all_MiniLM_l6_v2()
     print("[LOG] Downloading text classifier...")
     _, device = load_text_multilabel_classifier()
     print(f"[LOG] Text classifier loaded on {device}")
diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py
index bfa1dd14..6046c9bb 100644
--- a/docs/examples/quickstart.py
+++ b/docs/examples/quickstart.py
@@ -164,6 +164,22 @@ def interactive_extraction(crawler):
     cprint("[LOG] πŸ“¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
     print_result(result)
 
+def multiple_scrip(crawler):
+    # Passing JavaScript code to interact with the page
+    cprint("\nπŸ–±οΈ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]", True)
+    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
+    js_code = ["""
+    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
+    loadMoreButton && loadMoreButton.click();
+    """] * 2
+    crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
+    crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
+    result = crawler.run(
+        url="https://www.nbcnews.com/business",
+    )
+    cprint("[LOG] πŸ“¦ [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
+    print_result(result)
+
 def main():
     cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
     cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
@@ -180,6 +196,7 @@ def main():
     add_llm_extraction_strategy(crawler)
     targeted_extraction(crawler)
     interactive_extraction(crawler)
+    multiple_scrip(crawler)
 
     cprint("\nπŸŽ‰ [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! πŸ•ΈοΈ[/bold green]")
 
diff --git a/requirements.crawl.txt b/requirements.crawl.txt
new file mode 100644
index 00000000..d72800cf
--- /dev/null
+++ b/requirements.crawl.txt
@@ -0,0 +1,13 @@
+aiohttp
+aiosqlite
+bs4
+fastapi
+html2text
+httpx
+pydantic
+python-dotenv
+requests
+rich
+selenium
+uvicorn
+chromedriver-autoinstaller
diff --git a/requirements.txt b/requirements.txt
index 5de4dc9a..f4fdce65 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,20 +1,20 @@
-aiohttp==3.9.5
-aiosqlite==0.20.0
-bs4==0.0.2
-fastapi==0.111.0
+aiohttp
+aiosqlite
+bs4
+fastapi
 html2text
-httpx==0.27.0
-litellm==1.37.11
-nltk==3.8.1
-pydantic==2.7.1
-python-dotenv==1.0.1
-requests==2.31.0
-rich==13.7.1
-scikit-learn==1.4.2
-selenium==4.20.0
-uvicorn==0.29.0
-transformers==4.40.2
-chromedriver-autoinstaller==0.6.4
+httpx
+litellm
+nltk
+pydantic
+python-dotenv
+requests
+rich
+scikit-learn
+selenium
+uvicorn
+transformers
+chromedriver-autoinstaller
 torch
 onnxruntime
 tokenizers
diff --git a/setup.py b/setup.py
index 1690e140..8f490469 100644
--- a/setup.py
+++ b/setup.py
@@ -7,11 +7,16 @@ from setuptools.command.install import install
 with open("requirements.txt") as f:
     requirements = f.read().splitlines()
 
+# Read the crawl-only requirements from requirements.crawl.txt
+with open("requirements.crawl.txt") as f:
+    requirements_crawl_only = f.read().splitlines()
+
 # Define the requirements for different environments
 requirements_without_torch = [req for req in requirements if not req.startswith("torch")]
 requirements_without_transformers = [req for req in requirements if not req.startswith("transformers")]
 requirements_without_nltk = [req for req in requirements if not req.startswith("nltk")]
 requirements_without_torch_transformers_nlkt = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
+requirements_crawl_only = [req for req in requirements if not req.startswith("torch") and not req.startswith("transformers") and not req.startswith("nltk")]
 
 class CustomInstallCommand(install):
     """Customized setuptools install command to install spacy without dependencies."""
@@ -21,7 +26,7 @@ class CustomInstallCommand(install):
 
 setup(
     name="Crawl4AI",
-    version="0.2.1",
+    version="0.2.2",
     description="πŸ”₯πŸ•·οΈ Crawl4AI: Open-source LLM Friendly Web Crawler & Scrapper",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
@@ -34,7 +39,7 @@ setup(
     extras_require={
         "all": requirements,  # Include all requirements
         "colab": requirements_without_torch,  # Exclude torch for Colab
-        "crawl": requirements_without_torch_transformers_nlkt
+        "crawl": requirements_crawl_only,  # Include only crawl requirements
     },
     cmdclass={
         'install': CustomInstallCommand,
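
Below is a minimal usage sketch of the multi-script support introduced in this change, mirroring the new `multiple_scrip` example in `docs/examples/quickstart.py`. The import paths and the `warmup()` call are assumptions inferred from the quickstart and README snippets in this diff rather than part of the change itself; when `js_code` is a list, `LocalSeleniumCrawlerStrategy` executes each script in order and waits for `document.readyState == "complete"` after every one.

```python
# Hypothetical usage sketch; import paths are assumed, not verified against the package layout.
from crawl4ai.web_crawler import WebCrawler                          # assumption
from crawl4ai.crawler_strategy import LocalSeleniumCrawlerStrategy   # path taken from this diff

# A list of scripts is now accepted: each entry runs in order, and the strategy
# waits for document.readyState == "complete" after every script.
js_code = [
    "window.scrollTo(0, document.body.scrollHeight);",
    """
    const loadMoreButton = Array.from(document.querySelectorAll('button'))
        .find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """,
]

crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
crawler.warmup()
result = crawler.run(url="https://www.nbcnews.com/business")
print(result)
```

For a crawl-only environment without torch, transformers, or nltk, the updated `extras_require` in setup.py suggests installing with the `crawl` extra, e.g. `pip install "Crawl4AI[crawl]"`.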