diff --git a/.gitignore b/.gitignore index ba67f66f..1e16241b 100644 --- a/.gitignore +++ b/.gitignore @@ -266,4 +266,5 @@ tests/**/benchmark_reports docs/**/data .codecat/ -docs/apps/linkdin/debug*/ \ No newline at end of file +docs/apps/linkdin/debug*/ +docs/apps/linkdin/samples/insights/* \ No newline at end of file diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 7a04fd04..3fcd9911 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1336,7 +1336,7 @@ class LLMConfig: provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, base_url: Optional[str] = None, - temprature: Optional[float] = None, + temperature: Optional[float] = None, max_tokens: Optional[int] = None, top_p: Optional[float] = None, frequency_penalty: Optional[float] = None, @@ -1364,7 +1364,7 @@ class LLMConfig: self.provider = DEFAULT_PROVIDER self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY) self.base_url = base_url - self.temprature = temprature + self.temperature = temperature self.max_tokens = max_tokens self.top_p = top_p self.frequency_penalty = frequency_penalty @@ -1378,7 +1378,7 @@ class LLMConfig: provider=kwargs.get("provider", DEFAULT_PROVIDER), api_token=kwargs.get("api_token"), base_url=kwargs.get("base_url"), - temprature=kwargs.get("temprature"), + temperature=kwargs.get("temperature"), max_tokens=kwargs.get("max_tokens"), top_p=kwargs.get("top_p"), frequency_penalty=kwargs.get("frequency_penalty"), @@ -1392,7 +1392,7 @@ class LLMConfig: "provider": self.provider, "api_token": self.api_token, "base_url": self.base_url, - "temprature": self.temprature, + "temperature": self.temperature, "max_tokens": self.max_tokens, "top_p": self.top_p, "frequency_penalty": self.frequency_penalty, diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 561dea21..6ee43961 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -255,6 +255,13 @@ class ManagedBrowser: preexec_fn=os.setpgrp # Start in 
a new process group ) + # If verbose is True print args used to run the process + if self.logger and self.browser_config.verbose: + self.logger.debug( + f"Starting browser with args: {' '.join(args)}", + tag="BROWSER" + ) + # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring await asyncio.sleep(0.5) # Give browser time to start await self._initial_startup_check() diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 245abc54..966f333e 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -541,7 +541,7 @@ class LLMExtractionStrategy(ExtractionStrategy): api_token: The API token for the provider. base_url: The base URL for the API request. api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc. """ super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config @@ -1168,7 +1168,11 @@ In this scenario, use your best judgment to generate the schema. You need to exa elif not query and not target_json_example: user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content.""" - user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads. + user_message["content"] += """IMPORTANT: + 0/ Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads. 
+ 1/ DO NOT USE use base64 kind of classes, they are temporary and not reliable. + 2/ Every selector must refer to only one unique element. You should ensure your selector points to a single element and is unique to the place that contains the information. You have to use available techniques based on CSS or XPATH requested schema to make sure your selector is unique and also not fragile, meaning if we reload the page now or in the future, the selector should remain reliable. + 3/ Do not use Regex as much as possible. Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else. """ diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index dbf8a64d..47fe165b 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -6,6 +6,7 @@ import html import lxml import re import os +import subprocess import platform from .prompts import PROMPT_EXTRACT_BLOCKS from array import array @@ -2868,5 +2869,74 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre except Exception as e: # Fallback for parsing errors - return html_content[:max_size] if len(html_content) > max_size else html_content + return html_content[:max_size] if len(html_content) > max_size else html_content + +def start_colab_display_server(): + """ + Start virtual display server in Google Colab. + Raises error if not running in Colab environment. 
+ """ + # Check if running in Google Colab + try: + import google.colab + from google.colab import output + from IPython.display import IFrame, display + except ImportError: + raise RuntimeError("This function must be run in Google Colab environment.") + import os, time, subprocess + + os.environ["DISPLAY"] = ":99" + + # Xvfb + xvfb = subprocess.Popen(["Xvfb", ":99", "-screen", "0", "1280x720x24"]) + time.sleep(2) + + # minimal window manager + fluxbox = subprocess.Popen(["fluxbox"]) + + # VNC → X + x11vnc = subprocess.Popen(["x11vnc", + "-display", ":99", + "-nopw", "-forever", "-shared", + "-rfbport", "5900", "-quiet"]) + + # websockify → VNC + novnc = subprocess.Popen(["/opt/novnc/utils/websockify/run", + "6080", "localhost:5900", + "--web", "/opt/novnc"]) + + time.sleep(2) # give ports a moment + + # Colab proxy url + url = output.eval_js("google.colab.kernel.proxyPort(6080)") + display(IFrame(f"{url}/vnc.html?autoconnect=true&resize=scale", width=1024, height=768)) + + + +def setup_colab_environment_ipython(): + """ + Alternative setup using IPython magic commands + """ + from IPython import get_ipython + ipython = get_ipython() + + print("🚀 Setting up Crawl4AI environment in Google Colab...") + + # Run the bash commands + ipython.run_cell_magic('bash', '', ''' +set -e + +echo "📦 Installing system dependencies..." +apt-get update -y +apt-get install -y xvfb x11vnc fluxbox websockify git + +echo "📥 Setting up noVNC..." +git clone https://github.com/novnc/noVNC /opt/novnc +git clone https://github.com/novnc/websockify /opt/novnc/utils/websockify + +pip install -q nest_asyncio google-colab + +echo "✅ Setup complete!" 
+''') + diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index f2551c01..1b250126 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -1263,7 +1263,7 @@ class LLMConfig: provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, base_url: Optional[str] = None, - temprature: Optional[float] = None, + temperature: Optional[float] = None, max_tokens: Optional[int] = None, top_p: Optional[float] = None, frequency_penalty: Optional[float] = None, @@ -1291,7 +1291,7 @@ class LLMConfig: self.provider = DEFAULT_PROVIDER self.api_token = os.getenv(DEFAULT_PROVIDER_API_KEY) self.base_url = base_url - self.temprature = temprature + self.temperature = temperature self.max_tokens = max_tokens self.top_p = top_p self.frequency_penalty = frequency_penalty @@ -1305,7 +1305,7 @@ class LLMConfig: provider=kwargs.get("provider", DEFAULT_PROVIDER), api_token=kwargs.get("api_token"), base_url=kwargs.get("base_url"), - temprature=kwargs.get("temprature"), + temperature=kwargs.get("temperature"), max_tokens=kwargs.get("max_tokens"), top_p=kwargs.get("top_p"), frequency_penalty=kwargs.get("frequency_penalty"), @@ -1319,7 +1319,7 @@ class LLMConfig: "provider": self.provider, "api_token": self.api_token, "base_url": self.base_url, - "temprature": self.temprature, + "temperature": self.temperature, "max_tokens": self.max_tokens, "top_p": self.top_p, "frequency_penalty": self.frequency_penalty, @@ -4075,7 +4075,7 @@ class LLMExtractionStrategy(ExtractionStrategy): api_token: The API token for the provider. base_url: The base URL for the API request. api_base: The base URL for the API request. - extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc. + extra_args: Additional arguments for the API request, such as temperature, max_tokens, etc. 
""" super().__init__( input_format=input_format, **kwargs) self.llm_config = llm_config diff --git a/docs/apps/linkdin/c4ai_discover.py b/docs/apps/linkdin/c4ai_discover.py index f101ce0c..310b61b6 100644 --- a/docs/apps/linkdin/c4ai_discover.py +++ b/docs/apps/linkdin/c4ai_discover.py @@ -107,7 +107,14 @@ _COMPANY_SCHEMA_QUERY = dedent( IMPORTANT: Do not use the base64 kind of classes to target element. It's not reliable. The main div parent contains these li element is "div.search-results-container" you can use this. - The