Compare commits

..

2 Commits

Author SHA1 Message Date
coderabbitai[bot]
32fcacafa6 📝 Add docstrings to codex/find-and-fix-a-bug
Docstring generation was requested by @unclecode.

* https://github.com/unclecode/crawl4ai/pull/1122#issuecomment-2887985865

The following files were modified:

* `crawl4ai/utils.py`
2025-05-17 02:37:00 +00:00
UncleCode
45f1652d98 Fix merge_chunks splitter usage and remove incorrect return 2025-05-17 10:31:19 +08:00
13 changed files with 4 additions and 112 deletions

View File

@@ -764,9 +764,6 @@ class CrawlerRunConfig():
Default: 60000 (60 seconds). Default: 60000 (60 seconds).
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content. wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
Default: None. Default: None.
wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
If None, uses page_timeout instead.
Default: None.
wait_for_images (bool): If True, wait for images to load before extracting content. wait_for_images (bool): If True, wait for images to load before extracting content.
Default: False. Default: False.
delay_before_return_html (float): Delay in seconds before retrieving final HTML. delay_before_return_html (float): Delay in seconds before retrieving final HTML.
@@ -907,7 +904,6 @@ class CrawlerRunConfig():
wait_until: str = "domcontentloaded", wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT, page_timeout: int = PAGE_TIMEOUT,
wait_for: str = None, wait_for: str = None,
wait_for_timeout: int = None,
wait_for_images: bool = False, wait_for_images: bool = False,
delay_before_return_html: float = 0.1, delay_before_return_html: float = 0.1,
mean_delay: float = 0.1, mean_delay: float = 0.1,
@@ -1004,7 +1000,6 @@ class CrawlerRunConfig():
self.wait_until = wait_until self.wait_until = wait_until
self.page_timeout = page_timeout self.page_timeout = page_timeout
self.wait_for = wait_for self.wait_for = wait_for
self.wait_for_timeout = wait_for_timeout
self.wait_for_images = wait_for_images self.wait_for_images = wait_for_images
self.delay_before_return_html = delay_before_return_html self.delay_before_return_html = delay_before_return_html
self.mean_delay = mean_delay self.mean_delay = mean_delay
@@ -1146,7 +1141,6 @@ class CrawlerRunConfig():
wait_until=kwargs.get("wait_until", "domcontentloaded"), wait_until=kwargs.get("wait_until", "domcontentloaded"),
page_timeout=kwargs.get("page_timeout", 60000), page_timeout=kwargs.get("page_timeout", 60000),
wait_for=kwargs.get("wait_for"), wait_for=kwargs.get("wait_for"),
wait_for_timeout=kwargs.get("wait_for_timeout"),
wait_for_images=kwargs.get("wait_for_images", False), wait_for_images=kwargs.get("wait_for_images", False),
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1), delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
mean_delay=kwargs.get("mean_delay", 0.1), mean_delay=kwargs.get("mean_delay", 0.1),
@@ -1256,7 +1250,6 @@ class CrawlerRunConfig():
"wait_until": self.wait_until, "wait_until": self.wait_until,
"page_timeout": self.page_timeout, "page_timeout": self.page_timeout,
"wait_for": self.wait_for, "wait_for": self.wait_for,
"wait_for_timeout": self.wait_for_timeout,
"wait_for_images": self.wait_for_images, "wait_for_images": self.wait_for_images,
"delay_before_return_html": self.delay_before_return_html, "delay_before_return_html": self.delay_before_return_html,
"mean_delay": self.mean_delay, "mean_delay": self.mean_delay,

View File

@@ -937,10 +937,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.wait_for: if config.wait_for:
try: try:
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
await self.smart_wait( await self.smart_wait(
page, config.wait_for, timeout=timeout page, config.wait_for, timeout=config.page_timeout
) )
except Exception as e: except Exception as e:
raise RuntimeError(f"Wait condition failed: {str(e)}") raise RuntimeError(f"Wait condition failed: {str(e)}")

View File

@@ -964,10 +964,7 @@ class BrowserManager:
pages = context.pages pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None) page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page: if not page:
if pages: page = context.pages[0] # await context.new_page()
page = context.pages[0]
else:
page = await context.new_page()
else: else:
# Otherwise, check if we have an existing context for this config # Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig) config_signature = self._make_config_signature(crawlerRunConfig)

View File

@@ -235,7 +235,6 @@ async def crawl_people_page(
cache_mode=CacheMode.BYPASS, cache_mode=CacheMode.BYPASS,
magic=True, magic=True,
wait_for=".org-people-profile-card__card-spacing", wait_for=".org-people-profile-card__card-spacing",
wait_for_images=5000,
delay_before_return_html=1, delay_before_return_html=1,
session_id="people_search", session_id="people_search",
) )
@@ -421,9 +420,8 @@ def main():
cli_opts = parser.parse_args() cli_opts = parser.parse_args()
# decide on debug defaults # decide on debug defaults
if cli_opts.debug or True: if cli_opts.debug:
opts = detect_debug_defaults(force=True) opts = detect_debug_defaults(force=True)
cli_opts = opts
else: else:
env_defaults = detect_debug_defaults() env_defaults = detect_debug_defaults()
opts = env_defaults if env_defaults else cli_opts opts = env_defaults if env_defaults else cli_opts

View File

@@ -1,37 +0,0 @@
/* docs/assets/feedback-overrides.css */
:root {
/* brand */
--feedback-primary-color: #09b5a5;
--feedback-highlight-color: #fed500; /* stars etc */
/* modal shell / text */
--feedback-modal-content-bg-color: var(--background-color);
--feedback-modal-content-text-color: var(--font-color);
--feedback-modal-content-border-color: var(--primary-dimmed-color);
--feedback-modal-content-border-radius: 4px;
/* overlay */
--feedback-overlay-bg-color: rgba(0,0,0,.75);
/* rating buttons */
--feedback-modal-rating-button-color: var(--secondary-color);
--feedback-modal-rating-button-selected-color: var(--primary-color);
/* inputs */
--feedback-modal-input-bg-color: var(--code-bg-color);
--feedback-modal-input-text-color: var(--font-color);
--feedback-modal-input-border-color: var(--primary-dimmed-color);
--feedback-modal-input-border-color-focused: var(--primary-color);
/* submit / secondary buttons */
--feedback-modal-button-submit-bg-color: var(--primary-color);
--feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
--feedback-modal-button-submit-text-color: var(--invert-font-color);
--feedback-modal-button-bg-color: transparent; /* screenshot btn */
--feedback-modal-button-border-color: var(--primary-color);
--feedback-modal-button-icon-color: var(--primary-color);
}
/* optional: keep the “Powered by” link subtle */
.feedback-logo a{color:var(--secondary-color);}

View File

@@ -1,5 +0,0 @@
window.dataLayer = window.dataLayer || [];
function gtag(){dataLayer.push(arguments);}
gtag('js', new Date());
gtag('config', 'G-58W0K2ZQ25');

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.4 KiB

View File

@@ -1,47 +0,0 @@
{% set extra_html_attrs = 'data-theme="dark"' %}
{% extends "base.html" %}
{% block extrahead %}
{{ super() }}
<script>
document.documentElement.setAttribute("data-theme", "dark");
</script>
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
<style>
:root {
/* brand */
--feedback-primary-color: #09b5a5;
--feedback-highlight-color: #fed500;
/* align with the value you really use in :root */
--header-height: 65px;
/* Push modal content down */
--feedback-modal-content-position-top: var(--header-height);
--feedback-modal-modal-wrapper-z-index: 1100;
/* > headers 1000 */
--feedback-modal-content-z-index: 1101;
}
feedback-modal::part(overlay) {
top: var(--header-height);
/* start below header */
height: calc(100vh - var(--header-height));
/* fill the rest */
}
</style>
<script type="module"
src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
{% endblock %}
{% block footer %}
<feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right" modal-position="sidebar-right">
>
Feedback
</feedback-button>
{% endblock %}

View File

@@ -1,5 +1,4 @@
site_name: Crawl4AI Documentation (v0.6.x) site_name: Crawl4AI Documentation (v0.6.x)
site_favicon: docs/md_v2/favicon.ico
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai repo_url: https://github.com/unclecode/crawl4ai
@@ -58,8 +57,6 @@ nav:
theme: theme:
name: 'terminal' name: 'terminal'
palette: 'dark' palette: 'dark'
custom_dir: docs/md_v2/overrides
color_mode: 'dark'
icon: icon:
repo: fontawesome/brands/github repo: fontawesome/brands/github
@@ -85,11 +82,8 @@ extra_css:
- assets/styles.css - assets/styles.css
- assets/highlight.css - assets/highlight.css
- assets/dmvendor.css - assets/dmvendor.css
- assets/feedback-overrides.css
extra_javascript: extra_javascript:
- https://www.googletagmanager.com/gtag/js?id=G-58W0K2ZQ25
- assets/gtag.js
- assets/highlight.min.js - assets/highlight.min.js
- assets/highlight_init.js - assets/highlight_init.js
- https://buttons.github.io/buttons.js - https://buttons.github.io/buttons.js

View File

@@ -26,6 +26,7 @@ dependencies = [
"xxhash~=3.4", "xxhash~=3.4",
"rank-bm25~=0.2", "rank-bm25~=0.2",
"aiofiles>=24.1.0", "aiofiles>=24.1.0",
"colorama~=0.4",
"snowballstemmer~=2.2", "snowballstemmer~=2.2",
"pydantic>=2.10", "pydantic>=2.10",
"pyOpenSSL>=24.3.0", "pyOpenSSL>=24.3.0",