refactor: Update LocalSeleniumCrawlerStrategy to execute JS code if provided

add proxy and LLM base_url usage examples
replace base64 image URLs with ''
2024-09-01 16:34:51 +08:00 · 2024-08-27 10:14:54 +08:00 · 2024-08-27 09:44:35 +08:00 · 2024-08-26 16:12:49 +08:00 · 2024-08-26 15:29:23 +08:00
8 changed files with 57 additions and 18 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -189,6 +189,4 @@ a.txt
 .lambda_function.py
 ec2*

-update_changelog.sh
-test_env/
-tmp/
+update_changelog.sh
--- a/README.md
+++ b/README.md
@@ -190,6 +190,33 @@ result = crawler.run(
 print(result.extracted_content)
 ```

+### Extract Structured Data from Web Pages Using a Proxy and a Custom Base URL
+
+```python
+from crawl4ai import WebCrawler
+from crawl4ai.extraction_strategy import LLMExtractionStrategy
+
+def create_crawler():
+    crawler = WebCrawler(verbose=True, proxy="http://127.0.0.1:7890")
+    crawler.warmup()
+    return crawler
+
+crawler = create_crawler()
+
+crawler.warmup()
+
+result = crawler.run(
+    url="https://www.nbcnews.com/business",
+    extraction_strategy=LLMExtractionStrategy(
+        provider="openai/gpt-4o",
+        api_token="sk-",
+        base_url="https://api.openai.com/v1"
+    )
+)
+
+print(result.markdown)
+```
+
 ## Documentation 📚

 For detailed documentation, including installation instructions, advanced features, and API reference, visit our [Documentation Website](https://crawl4ai.com/mkdocs/).
--- a/crawl4ai/crawler_strategy.py
+++ b/crawl4ai/crawler_strategy.py
@@ -82,6 +82,8 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
        print("[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy")
        self.options = Options()
        self.options.headless = True
+        if kwargs.get("proxy"):
+            self.options.add_argument("--proxy-server={}".format(kwargs.get("proxy")))
        if kwargs.get("user_agent"):
            self.options.add_argument("--user-agent=" + kwargs.get("user_agent"))
        else:
@@ -242,6 +244,7 @@ class LocalSeleniumCrawlerStrategy(CrawlerStrategy):
                driver.quit()
            
            # Execute JS code if provided
+            self.js_code = kwargs.get("js_code", self.js_code)
            if self.js_code and type(self.js_code) == str:
                self.driver.execute_script(self.js_code)
                # Optionally, wait for some condition after executing the JS code
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -79,6 +79,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
        self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
        self.apply_chunking = kwargs.get("apply_chunking", True)
+        self.base_url = kwargs.get("base_url", None)
        if not self.apply_chunking:
            self.chunk_token_threshold = 1e9
        
@@ -101,7 +102,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
            variable_values["REQUEST"] = self.instruction
            prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
            
-        if self.extract_type == "schema" and self.schema:
+        if self.extract_type == "schema":
            variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
            prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION

@@ -110,7 +111,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
                "{" + variable + "}", variable_values[variable]
            )
        
-        response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token) # , json_response=self.extract_type == "schema")
+        response = perform_completion_with_backoff(self.provider, prompt_with_variables, self.api_token, base_url=self.base_url) # , json_response=self.extract_type == "schema")
        try:
            blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
            blocks = json.loads(blocks)
--- a/crawl4ai/prompts.py
+++ b/crawl4ai/prompts.py
@@ -29,7 +29,7 @@ To generate the JSON objects:

 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

-6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

 Please provide your output within <blocks> tags, like this:

@@ -87,7 +87,7 @@ To generate the JSON objects:

 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

-6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

 7. Never alter the extracted content, just copy and paste it as it is.

@@ -142,7 +142,7 @@ To generate the JSON objects:

 5. Make sure the generated JSON is complete and parsable, with no errors or omissions.

-6. Make sur to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.
+6. Make sure to escape any special characters in the HTML content, and also single or double quote to avoid JSON parsing issues.

 7. Never alter the extracted content, just copy and paste it as it is.

@@ -201,4 +201,4 @@ Avoid Common Mistakes:
 - Do not generate the Python coee show me how to do the task, this is your task to extract the information and return it in JSON format.

 Result
-Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
+Output the final list of JSON objects, wrapped in <blocks>...</blocks> XML tags. Make sure to close the tag properly."""
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -634,7 +634,12 @@ def get_content_of_website_optimized(url: str, html: str, word_count_threshold:
        return node

    body = flatten_nested_elements(body)
-
+    base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
+    for img in imgs:
+        src = img.get('src', '')
+        if base64_pattern.match(src):
+            # Replace base64 data with empty string
+            img['src'] = base64_pattern.sub('', src)
    cleaned_html = str(body).replace('\n\n', '\n').replace('  ', ' ')
    cleaned_html = sanitize_html(cleaned_html)

@@ -716,7 +721,7 @@ def extract_xml_data(tags, string):
    return data
    
 # Function to perform the completion with exponential backoff
-def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False):
+def perform_completion_with_backoff(provider, prompt_with_variables, api_token, json_response = False, base_url=None):
    from litellm import completion 
    from litellm.exceptions import RateLimitError
    max_attempts = 3
@@ -735,6 +740,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
                ],
                temperature=0.01,
                api_key=api_token,
+                base_url=base_url,
                **extra_args
            )
            return response  # Return the successful response
@@ -755,7 +761,7 @@ def perform_completion_with_backoff(provider, prompt_with_variables, api_token,
                    "content": ["Rate limit error. Please try again later."]
                }]
    
-def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
+def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None, base_url = None):
    # api_token = os.getenv('GROQ_API_KEY', None) if not api_token else api_token
    api_token = PROVIDER_MODELS.get(provider, None) if not api_token else api_token
    
@@ -770,7 +776,7 @@ def extract_blocks(url, html, provider = DEFAULT_PROVIDER, api_token = None):
            "{" + variable + "}", variable_values[variable]
        )
        
-    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token)
+    response = perform_completion_with_backoff(provider, prompt_with_variables, api_token, base_url=base_url)
        
    try:
        blocks = extract_xml_data(["blocks"], response.choices[0].message.content)['blocks']
@@ -834,6 +840,7 @@ def extract_blocks_batch(batch_data, provider = "groq/llama3-70b-8192", api_toke
    
    return sum(all_blocks, [])

+
 def merge_chunks_based_on_token_threshold(chunks, token_threshold):
    """
    Merges small chunks into larger ones based on the total token threshold.
@@ -863,22 +870,23 @@ def merge_chunks_based_on_token_threshold(chunks, token_threshold):

    return merged_sections

-def process_sections(url: str, sections: list, provider: str, api_token: str) -> list:
+def process_sections(url: str, sections: list, provider: str, api_token: str, base_url=None) -> list:
    extracted_content = []
    if provider.startswith("groq/"):
        # Sequential processing with a delay
        for section in sections:
-            extracted_content.extend(extract_blocks(url, section, provider, api_token))
+            extracted_content.extend(extract_blocks(url, section, provider, api_token, base_url=base_url))
            time.sleep(0.5)  # 500 ms delay between each processing
    else:
        # Parallel processing using ThreadPoolExecutor
        with ThreadPoolExecutor() as executor:
-            futures = [executor.submit(extract_blocks, url, section, provider, api_token) for section in sections]
+            futures = [executor.submit(extract_blocks, url, section, provider, api_token, base_url=base_url) for section in sections]
            for future in as_completed(futures):
                extracted_content.extend(future.result())
    
    return extracted_content

+
 def wrap_text(draw, text, font, max_width):
    # Wrap the text to fit within the specified width
    lines = []
@@ -890,6 +898,7 @@ def wrap_text(draw, text, font, max_width):
        lines.append(line)
    return '\n'.join(lines)

+
 def format_html(html_string):
    soup = BeautifulSoup(html_string, 'html.parser')
    return soup.prettify()
--- a/crawl4ai/web_crawler.py
+++ b/crawl4ai/web_crawler.py
@@ -22,9 +22,10 @@ class WebCrawler:
        crawler_strategy: CrawlerStrategy = None,
        always_by_pass_cache: bool = False,
        verbose: bool = False,
+        proxy: str = None,
    ):
        # self.db_path = db_path
-        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
+        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose, proxy=proxy)
        self.always_by_pass_cache = always_by_pass_cache

        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ with open("requirements.txt") as f:
    requirements = f.read().splitlines()

 # Define the requirements for different environments
-default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn"))]
+default_requirements = [req for req in requirements if not req.startswith(("torch", "transformers", "onnxruntime", "nltk", "spacy", "tokenizers", "scikit-learn", "numpy"))]
 torch_requirements = [req for req in requirements if req.startswith(("torch", "nltk", "spacy", "scikit-learn", "numpy"))]
 transformer_requirements = [req for req in requirements if req.startswith(("transformers", "tokenizers", "onnxruntime"))]
Author	SHA1	Message	Date
unclecode	3caf48c9be	refactor: Update LocalSeleniumCrawlerStrategy to execute JS code if provided	2024-09-01 16:34:51 +08:00
datehoer	2ba70b9501	add use proxy and llm baseurl examples	2024-08-27 10:14:54 +08:00
datehoer	16f98cebc0	replace base64 image url to ''	2024-08-27 09:44:35 +08:00
datehoer	fe9ff498ce	add proxy and add ai base_url	2024-08-26 16:12:49 +08:00
Datehoer	eba831ca30	fix spelling mistake	2024-08-26 15:29:23 +08:00