diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 9892134f..b0b8a2aa 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -631,4 +631,9 @@ def wrap_text(draw, text, font, max_width): while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width: line += (words.pop(0) + ' ') lines.append(line) - return '\n'.join(lines) \ No newline at end of file + return '\n'.join(lines) + + +def format_html(html_string): + soup = BeautifulSoup(html_string, 'html.parser') + return soup.prettify() \ No newline at end of file diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 5dd4b9c0..13434e80 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -140,24 +140,28 @@ class WebCrawler: # Check cache first cached = None + screenshot_data = None extracted_content = None if not bypass_cache and not self.always_by_pass_cache: cached = get_cached_url(url) if cached: html = cached[1] - extracted_content = cached[2] + extracted_content = cached[4] if screenshot: - screenshot = cached[9] + screenshot_data = cached[9] + if not screenshot_data: + cached = None - else: + if not cached or not html: if user_agent: self.crawler_strategy.update_user_agent(user_agent) html = self.crawler_strategy.crawl(url) if screenshot: - screenshot = self.crawler_strategy.take_screenshot() + screenshot_data = self.crawler_strategy.take_screenshot() + - return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs) + return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) def process_html( self, @@ -197,7 +201,7 @@ class WebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content) + extracted_content = 
json.dumps(extracted_content, indent=4, default=str) if verbose: print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") @@ -217,11 +221,11 @@ class WebCrawler: json.dumps(metadata), screenshot=screenshot, ) - + return CrawlResult( url=url, html=html, - cleaned_html=cleaned_html, + cleaned_html=format_html(cleaned_html), markdown=markdown, media=media, links=links, diff --git a/docs/md/assets/styles.css b/docs/md/assets/styles.css index 51343a76..f4680880 100644 --- a/docs/md/assets/styles.css +++ b/docs/md/assets/styles.css @@ -15,7 +15,6 @@ --mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, Courier New, monospace, serif; - --background-color: #151515; /* Dark background */ --font-color: #eaeaea; /* Light font color for contrast */ --invert-font-color: #151515; /* Dark color for inverted elements */ @@ -73,11 +72,78 @@ pre, code { border-bottom: 1px dashed var(--secondary-color); } */ -.terminal-mkdocs-main-content{ +.terminal-mkdocs-main-content { line-height: var(--global-line-height); } -strong, .highlight { +strong, +.highlight { /* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */ background-color: #50ffff33; +} + +.terminal-card > header { + color: var(--font-color); + text-align: center; + background-color: var(--progress-bar-background); + padding: 0.3em 0.5em; +} +.btn.btn-sm { + color: var(--font-color); + padding: 0.2em 0.5em; + font-size: 0.8em; +} + +.loading-message { + display: none; + margin-top: 20px; +} + +.response-section { + display: none; + padding-top: 20px; +} + +.tabs { + display: flex; + flex-direction: column; +} +.tab-list { + display: flex; + padding: 0; + margin: 0; + list-style-type: none; + border-bottom: 1px solid var(--font-color); +} +.tab-item { + cursor: pointer; + padding: 10px; + border: 1px solid var(--font-color); + margin-right: -1px; + border-bottom: none; +} +.tab-item:hover, +.tab-item:focus, 
+.tab-item:active { + background-color: var(--progress-bar-background); +} +.tab-content { + display: none; + border: 1px solid var(--font-color); + border-top: none; +} +.tab-content:first-of-type { + display: block; +} + +.tab-content header { + padding: 0.5em; + display: flex; + justify-content: end; + align-items: center; + background-color: var(--progress-bar-background); +} +.tab-content pre { + margin: 0; + max-height: 300px; overflow: auto; border:none; } \ No newline at end of file diff --git a/docs/md/demo.md b/docs/md/demo.md new file mode 100644 index 00000000..f97b658e --- /dev/null +++ b/docs/md/demo.md @@ -0,0 +1,198 @@ +# Interactive Demo for Crawler +
+