From eba831ca30c7c3e97874c9c8a3acb1ec9df0ce95 Mon Sep 17 00:00:00 2001 From: Datehoer <62844803+datehoer@users.noreply.github.com> Date: Mon, 26 Aug 2024 15:29:23 +0800 Subject: [PATCH 1/5] fix spelling mistake --- crawl4ai/prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index 323c4774..a55d6fca 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -29,7 +29,7 @@ To generate the JSON objects: 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. -6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. Please provide your output within tags, like this: @@ -87,7 +87,7 @@ To generate the JSON objects: 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. -6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. 7. Never alter the extracted content, just copy and paste it as it is. @@ -142,7 +142,7 @@ To generate the JSON objects: 5. Make sure the generated JSON is complete and parsable, with no errors or omissions. -6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. +6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues. 7. Never alter the extracted content, just copy and paste it as it is. @@ -201,4 +201,4 @@ Avoid Common Mistakes: - Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format. Result -Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly.""" \ No newline at end of file +Output the final list of JSON objects, wrapped in ... XML tags. Make sure to close the tag properly.""" From fe9ff498ce1cbb3f453473c1721dfd306e60f3ee Mon Sep 17 00:00:00 2001 From: datehoer Date: Mon, 26 Aug 2024 16:12:49 +0800 Subject: [PATCH 2/5] add proxy and add ai base_url --- crawl4ai/crawler_strategy.py | 2 ++ crawl4ai/extraction_strategy.py | 3 ++- crawl4ai/utils.py | 13 +++++++------ crawl4ai/web_crawler.py | 3 ++- 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index fb7980d3..66a8f7dd 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -82,6 +82,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy") self.options = Options() self.options.headless = True + if kwargs.get("proxy"): + self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy"))) if kwargs.get("user_agent"): self.options.add_argument("--user-agent=" + kwargs.get("user_agent")) else: diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 080229f4..8096f11c 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -79,6 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy): self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) self.apply_chunking = kwargs.get("apply_chunking", True) + self.base_url = kwargs.get("base_url", None) if not self.apply_chunking: self.chunk_token_threshold = 1e9 @@ -110,7 +111,7 @@ class LLMExtractionStrategy(ExtractionStrategy): "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema") + response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema") try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] blocks = json.loads(blocks) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 07832888..64ce9f57 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -716,7 +716,7 @@ def extract_xml_data(tags, string): return data # Function to perform the completion with exponential backoff -def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False): +def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None): from litellm import completion from litellm.exceptions import RateLimitError max_attempts = 3 @@ -735,6 +735,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token, ], temperature=0.01, api_key=api_token, + base_url=base_url, **extra_args ) return response # Return the successful response @@ -755,7 +756,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token, "content": ["Rate limit error. Please try again later."] }] -def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None): +def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None): # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token @@ -770,7 +771,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None): "{" + variable + "}", variable_values[variable] ) - response = perform_completion_with_backoff(provider, prompt_with_variables, api_token) + response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url) try: blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks'] @@ -864,17 +865,17 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold): return merged_sections -def process_sections(url: str, sections: list, provider: str, api_token: str) -> list: +def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list: extracted_content = [] if provider.startswith("groq/"): # Sequential processing with a delay for section in sections: - extracted_content.extend(extract_blocks(url, section, provider, api_token)) + extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url)) time.sleep(0.5) # 500 ms delay between each processing else: # Parallel processing using ThreadPoolExecutor with ThreadPoolExecutor() as executor: - futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections] + futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections] for future in as_completed(futures): extracted_content.extend(future.result()) diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index db0d9856..b354b5cd 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -22,9 +22,10 @@ class WebCrawler: crawler_strategy: CrawlerStrategy = None, always_by_pass_cache: bool = False, verbose: bool = False, + proxy: str = None, ): # self.db_path = db_path - self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose) + self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy) self.always_by_pass_cache = always_by_pass_cache # Create the .crawl4ai folder in the user's home directory if it doesn't exist From 16f98cebc0b0e75c0842aa4d13e45cbaea9ec8af Mon Sep 17 00:00:00 2001 From: datehoer Date: Tue, 27 Aug 2024 09:44:35 +0800 Subject: [PATCH 3/5] replace base64 image url to '' --- crawl4ai/utils.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 64ce9f57..2ea6fec7 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold: return node body = flatten_nested_elements(body) - + base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)') + for img in imgs: + src = img.get('src', '') + if base64_pattern.match(src): + # Replace base64 data with empty string + img['src'] = base64_pattern.sub('', src) cleaned_html = str(body).replace('\n\n', '\n').replace(' ', ' ') cleaned_html = sanitize_html(cleaned_html) From 2ba70b95018756077cb26ab90a76f6cc019aa7b3 Mon Sep 17 00:00:00 2001 From: datehoer Date: Tue, 27 Aug 2024 10:14:54 +0800 Subject: [PATCH 4/5] add use proxy and llm baseurl examples --- README.md | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.md b/README.md index f2975ad7..6bbef7e4 100644 --- a/README.md +++ b/README.md @@ -190,6 +190,33 @@ result = crawler.run( print(result.extracted_content) ``` +### Extract Structured Data from Web Pages With Proxy and BaseUrl + +```python +from crawl4ai import WebCrawler +from crawl4ai.extraction_strategy import LLMExtractionStrategy + +def create_crawler(): + crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890") + crawler.warmup() + return crawler + +crawler = create_crawler() + +crawler.warmup() + +result = crawler.run( + url="https://www.nbcnews.com/business", + extraction_strategy=LLMExtractionStrategy( + provider="openai/gpt-4o", + api_token="sk-", + base_url="https://api.openai.com/v1" + ) +) + +print(result.markdown) +``` + ## Documentation 📚 For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/). From 3caf48c9be3e289e8e918c185486969f011ee9a5 Mon Sep 17 00:00:00 2001 From: unclecode Date: Sun, 1 Sep 2024 16:34:51 +0800 Subject: [PATCH 5/5] refactor: Update LocalSeleniumCrawlerStrategy to execute JS code if provided --- crawl4ai/crawler_strategy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/crawl4ai/crawler_strategy.py b/crawl4ai/crawler_strategy.py index 66a8f7dd..4d049069 100644 --- a/crawl4ai/crawler_strategy.py +++ b/crawl4ai/crawler_strategy.py @@ -244,6 +244,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy): driver.quit() # Execute JS code if provided + self.js_code = kwargs.get("js_code", self.js_code) if self.js_code and type(self.js_code) == str: self.driver.execute_script(self.js_code) # Optionally, wait for some condition after executing the JS code