Compare commits
2 Commits
codex/fix-
...
coderabbit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
32fcacafa6 | ||
|
|
45f1652d98 |
@@ -764,9 +764,6 @@ class CrawlerRunConfig():
|
|||||||
Default: 60000 (60 seconds).
|
Default: 60000 (60 seconds).
|
||||||
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
wait_for (str or None): A CSS selector or JS condition to wait for before extracting content.
|
||||||
Default: None.
|
Default: None.
|
||||||
wait_for_timeout (int or None): Specific timeout in ms for the wait_for condition.
|
|
||||||
If None, uses page_timeout instead.
|
|
||||||
Default: None.
|
|
||||||
wait_for_images (bool): If True, wait for images to load before extracting content.
|
wait_for_images (bool): If True, wait for images to load before extracting content.
|
||||||
Default: False.
|
Default: False.
|
||||||
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
delay_before_return_html (float): Delay in seconds before retrieving final HTML.
|
||||||
@@ -907,7 +904,6 @@ class CrawlerRunConfig():
|
|||||||
wait_until: str = "domcontentloaded",
|
wait_until: str = "domcontentloaded",
|
||||||
page_timeout: int = PAGE_TIMEOUT,
|
page_timeout: int = PAGE_TIMEOUT,
|
||||||
wait_for: str = None,
|
wait_for: str = None,
|
||||||
wait_for_timeout: int = None,
|
|
||||||
wait_for_images: bool = False,
|
wait_for_images: bool = False,
|
||||||
delay_before_return_html: float = 0.1,
|
delay_before_return_html: float = 0.1,
|
||||||
mean_delay: float = 0.1,
|
mean_delay: float = 0.1,
|
||||||
@@ -1004,7 +1000,6 @@ class CrawlerRunConfig():
|
|||||||
self.wait_until = wait_until
|
self.wait_until = wait_until
|
||||||
self.page_timeout = page_timeout
|
self.page_timeout = page_timeout
|
||||||
self.wait_for = wait_for
|
self.wait_for = wait_for
|
||||||
self.wait_for_timeout = wait_for_timeout
|
|
||||||
self.wait_for_images = wait_for_images
|
self.wait_for_images = wait_for_images
|
||||||
self.delay_before_return_html = delay_before_return_html
|
self.delay_before_return_html = delay_before_return_html
|
||||||
self.mean_delay = mean_delay
|
self.mean_delay = mean_delay
|
||||||
@@ -1146,7 +1141,6 @@ class CrawlerRunConfig():
|
|||||||
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
wait_until=kwargs.get("wait_until", "domcontentloaded"),
|
||||||
page_timeout=kwargs.get("page_timeout", 60000),
|
page_timeout=kwargs.get("page_timeout", 60000),
|
||||||
wait_for=kwargs.get("wait_for"),
|
wait_for=kwargs.get("wait_for"),
|
||||||
wait_for_timeout=kwargs.get("wait_for_timeout"),
|
|
||||||
wait_for_images=kwargs.get("wait_for_images", False),
|
wait_for_images=kwargs.get("wait_for_images", False),
|
||||||
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
delay_before_return_html=kwargs.get("delay_before_return_html", 0.1),
|
||||||
mean_delay=kwargs.get("mean_delay", 0.1),
|
mean_delay=kwargs.get("mean_delay", 0.1),
|
||||||
@@ -1256,7 +1250,6 @@ class CrawlerRunConfig():
|
|||||||
"wait_until": self.wait_until,
|
"wait_until": self.wait_until,
|
||||||
"page_timeout": self.page_timeout,
|
"page_timeout": self.page_timeout,
|
||||||
"wait_for": self.wait_for,
|
"wait_for": self.wait_for,
|
||||||
"wait_for_timeout": self.wait_for_timeout,
|
|
||||||
"wait_for_images": self.wait_for_images,
|
"wait_for_images": self.wait_for_images,
|
||||||
"delay_before_return_html": self.delay_before_return_html,
|
"delay_before_return_html": self.delay_before_return_html,
|
||||||
"mean_delay": self.mean_delay,
|
"mean_delay": self.mean_delay,
|
||||||
|
|||||||
@@ -937,10 +937,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
if config.wait_for:
|
if config.wait_for:
|
||||||
try:
|
try:
|
||||||
# Use wait_for_timeout if specified, otherwise fall back to page_timeout
|
|
||||||
timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout
|
|
||||||
await self.smart_wait(
|
await self.smart_wait(
|
||||||
page, config.wait_for, timeout=timeout
|
page, config.wait_for, timeout=config.page_timeout
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
raise RuntimeError(f"Wait condition failed: {str(e)}")
|
||||||
|
|||||||
@@ -964,10 +964,7 @@ class BrowserManager:
|
|||||||
pages = context.pages
|
pages = context.pages
|
||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
if not page:
|
if not page:
|
||||||
if pages:
|
page = context.pages[0] # await context.new_page()
|
||||||
page = context.pages[0]
|
|
||||||
else:
|
|
||||||
page = await context.new_page()
|
|
||||||
else:
|
else:
|
||||||
# Otherwise, check if we have an existing context for this config
|
# Otherwise, check if we have an existing context for this config
|
||||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||||
|
|||||||
@@ -235,7 +235,6 @@ async def crawl_people_page(
|
|||||||
cache_mode=CacheMode.BYPASS,
|
cache_mode=CacheMode.BYPASS,
|
||||||
magic=True,
|
magic=True,
|
||||||
wait_for=".org-people-profile-card__card-spacing",
|
wait_for=".org-people-profile-card__card-spacing",
|
||||||
wait_for_images=5000,
|
|
||||||
delay_before_return_html=1,
|
delay_before_return_html=1,
|
||||||
session_id="people_search",
|
session_id="people_search",
|
||||||
)
|
)
|
||||||
@@ -421,9 +420,8 @@ def main():
|
|||||||
cli_opts = parser.parse_args()
|
cli_opts = parser.parse_args()
|
||||||
|
|
||||||
# decide on debug defaults
|
# decide on debug defaults
|
||||||
if cli_opts.debug or True:
|
if cli_opts.debug:
|
||||||
opts = detect_debug_defaults(force=True)
|
opts = detect_debug_defaults(force=True)
|
||||||
cli_opts = opts
|
|
||||||
else:
|
else:
|
||||||
env_defaults = detect_debug_defaults()
|
env_defaults = detect_debug_defaults()
|
||||||
opts = env_defaults if env_defaults else cli_opts
|
opts = env_defaults if env_defaults else cli_opts
|
||||||
|
|||||||
@@ -1,37 +0,0 @@
|
|||||||
/* docs/assets/feedback-overrides.css */
|
|
||||||
:root {
|
|
||||||
/* brand */
|
|
||||||
--feedback-primary-color: #09b5a5;
|
|
||||||
--feedback-highlight-color: #fed500; /* stars etc */
|
|
||||||
|
|
||||||
/* modal shell / text */
|
|
||||||
--feedback-modal-content-bg-color: var(--background-color);
|
|
||||||
--feedback-modal-content-text-color: var(--font-color);
|
|
||||||
--feedback-modal-content-border-color: var(--primary-dimmed-color);
|
|
||||||
--feedback-modal-content-border-radius: 4px;
|
|
||||||
|
|
||||||
/* overlay */
|
|
||||||
--feedback-overlay-bg-color: rgba(0,0,0,.75);
|
|
||||||
|
|
||||||
/* rating buttons */
|
|
||||||
--feedback-modal-rating-button-color: var(--secondary-color);
|
|
||||||
--feedback-modal-rating-button-selected-color: var(--primary-color);
|
|
||||||
|
|
||||||
/* inputs */
|
|
||||||
--feedback-modal-input-bg-color: var(--code-bg-color);
|
|
||||||
--feedback-modal-input-text-color: var(--font-color);
|
|
||||||
--feedback-modal-input-border-color: var(--primary-dimmed-color);
|
|
||||||
--feedback-modal-input-border-color-focused: var(--primary-color);
|
|
||||||
|
|
||||||
/* submit / secondary buttons */
|
|
||||||
--feedback-modal-button-submit-bg-color: var(--primary-color);
|
|
||||||
--feedback-modal-button-submit-bg-color-hover: var(--primary-dimmed-color);
|
|
||||||
--feedback-modal-button-submit-text-color: var(--invert-font-color);
|
|
||||||
|
|
||||||
--feedback-modal-button-bg-color: transparent; /* screenshot btn */
|
|
||||||
--feedback-modal-button-border-color: var(--primary-color);
|
|
||||||
--feedback-modal-button-icon-color: var(--primary-color);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* optional: keep the “Powered by” link subtle */
|
|
||||||
.feedback-logo a{color:var(--secondary-color);}
|
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
window.dataLayer = window.dataLayer || [];
|
|
||||||
function gtag(){dataLayer.push(arguments);}
|
|
||||||
gtag('js', new Date());
|
|
||||||
|
|
||||||
gtag('config', 'G-58W0K2ZQ25');
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 3.4 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 1.6 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 1.4 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 3.4 KiB |
@@ -1,47 +0,0 @@
|
|||||||
{% set extra_html_attrs = 'data-theme="dark"' %}
|
|
||||||
{% extends "base.html" %}
|
|
||||||
|
|
||||||
{% block extrahead %}
|
|
||||||
{{ super() }}
|
|
||||||
<script>
|
|
||||||
document.documentElement.setAttribute("data-theme", "dark");
|
|
||||||
</script>
|
|
||||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/pushfeedback/dist/pushfeedback/pushfeedback.css">
|
|
||||||
|
|
||||||
<style>
|
|
||||||
:root {
|
|
||||||
/* brand */
|
|
||||||
--feedback-primary-color: #09b5a5;
|
|
||||||
--feedback-highlight-color: #fed500;
|
|
||||||
|
|
||||||
|
|
||||||
/* align with the value you really use in :root */
|
|
||||||
--header-height: 65px;
|
|
||||||
|
|
||||||
/* Push modal content down */
|
|
||||||
--feedback-modal-content-position-top: var(--header-height);
|
|
||||||
|
|
||||||
--feedback-modal-modal-wrapper-z-index: 1100;
|
|
||||||
/* > header’s 1000 */
|
|
||||||
--feedback-modal-content-z-index: 1101;
|
|
||||||
}
|
|
||||||
|
|
||||||
feedback-modal::part(overlay) {
|
|
||||||
top: var(--header-height);
|
|
||||||
/* start below header */
|
|
||||||
height: calc(100vh - var(--header-height));
|
|
||||||
/* fill the rest */
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
</style>
|
|
||||||
<script type="module"
|
|
||||||
src="https://cdn.jsdelivr.net/npm/pushfeedback@latest/dist/pushfeedback/pushfeedback.esm.js"></script>
|
|
||||||
{% endblock %}
|
|
||||||
|
|
||||||
{% block footer %}
|
|
||||||
<feedback-button project="w8plzp8vjp" button-style="dark" button-position="center-right" modal-position="sidebar-right">
|
|
||||||
>
|
|
||||||
Feedback
|
|
||||||
</feedback-button>
|
|
||||||
{% endblock %}
|
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
site_name: Crawl4AI Documentation (v0.6.x)
|
site_name: Crawl4AI Documentation (v0.6.x)
|
||||||
site_favicon: docs/md_v2/favicon.ico
|
|
||||||
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
|
||||||
site_url: https://docs.crawl4ai.com
|
site_url: https://docs.crawl4ai.com
|
||||||
repo_url: https://github.com/unclecode/crawl4ai
|
repo_url: https://github.com/unclecode/crawl4ai
|
||||||
@@ -58,8 +57,6 @@ nav:
|
|||||||
theme:
|
theme:
|
||||||
name: 'terminal'
|
name: 'terminal'
|
||||||
palette: 'dark'
|
palette: 'dark'
|
||||||
custom_dir: docs/md_v2/overrides
|
|
||||||
color_mode: 'dark'
|
|
||||||
icon:
|
icon:
|
||||||
repo: fontawesome/brands/github
|
repo: fontawesome/brands/github
|
||||||
|
|
||||||
@@ -85,11 +82,8 @@ extra_css:
|
|||||||
- assets/styles.css
|
- assets/styles.css
|
||||||
- assets/highlight.css
|
- assets/highlight.css
|
||||||
- assets/dmvendor.css
|
- assets/dmvendor.css
|
||||||
- assets/feedback-overrides.css
|
|
||||||
|
|
||||||
extra_javascript:
|
extra_javascript:
|
||||||
- https://www.googletagmanager.com/gtag/js?id=G-58W0K2ZQ25
|
|
||||||
- assets/gtag.js
|
|
||||||
- assets/highlight.min.js
|
- assets/highlight.min.js
|
||||||
- assets/highlight_init.js
|
- assets/highlight_init.js
|
||||||
- https://buttons.github.io/buttons.js
|
- https://buttons.github.io/buttons.js
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ dependencies = [
|
|||||||
"xxhash~=3.4",
|
"xxhash~=3.4",
|
||||||
"rank-bm25~=0.2",
|
"rank-bm25~=0.2",
|
||||||
"aiofiles>=24.1.0",
|
"aiofiles>=24.1.0",
|
||||||
|
"colorama~=0.4",
|
||||||
"snowballstemmer~=2.2",
|
"snowballstemmer~=2.2",
|
||||||
"pydantic>=2.10",
|
"pydantic>=2.10",
|
||||||
"pyOpenSSL>=24.3.0",
|
"pyOpenSSL>=24.3.0",
|
||||||
|
|||||||
Reference in New Issue
Block a user