diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py index 9892134f..b0b8a2aa 100644 --- a/crawl4ai/utils.py +++ b/crawl4ai/utils.py @@ -631,4 +631,9 @@ def wrap_text(draw, text, font, max_width): while words and draw.textbbox((0, 0), line + words[0], font=font)[2] <= max_width: line += (words.pop(0) + ' ') lines.append(line) - return '\n'.join(lines) \ No newline at end of file + return '\n'.join(lines) + + +def format_html(html_string): + soup = BeautifulSoup(html_string, 'html.parser') + return soup.prettify() \ No newline at end of file diff --git a/crawl4ai/web_crawler.py b/crawl4ai/web_crawler.py index 5dd4b9c0..13434e80 100644 --- a/crawl4ai/web_crawler.py +++ b/crawl4ai/web_crawler.py @@ -140,24 +140,28 @@ class WebCrawler: # Check cache first cached = None + screenshot_data = None extracted_content = None if not bypass_cache and not self.always_by_pass_cache: cached = get_cached_url(url) if cached: html = cached[1] - extracted_content = cached[2] + extracted_content = cached[4] if screenshot: - screenshot = cached[9] + screenshot_data = cached[9] + if not screenshot_data: + cached = None - else: + if not cached or not html: if user_agent: self.crawler_strategy.update_user_agent(user_agent) html = self.crawler_strategy.crawl(url) if screenshot: - screenshot = self.crawler_strategy.take_screenshot() + screenshot_data = self.crawler_strategy.take_screenshot() + - return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs) + return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs) def process_html( self, @@ -197,7 +201,7 @@ class WebCrawler: sections = chunking_strategy.chunk(markdown) extracted_content = extraction_strategy.run(url, sections) - extracted_content = json.dumps(extracted_content) + extracted_content = 
json.dumps(extracted_content, indent=4, default=str) if verbose: print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.") @@ -217,11 +221,11 @@ class WebCrawler: json.dumps(metadata), screenshot=screenshot, ) - + return CrawlResult( url=url, html=html, - cleaned_html=cleaned_html, + cleaned_html=format_html(cleaned_html), markdown=markdown, media=media, links=links, diff --git a/docs/md/assets/styles.css b/docs/md/assets/styles.css index 51343a76..f4680880 100644 --- a/docs/md/assets/styles.css +++ b/docs/md/assets/styles.css @@ -15,7 +15,6 @@ --mono-font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono, Courier New, monospace, serif; - --background-color: #151515; /* Dark background */ --font-color: #eaeaea; /* Light font color for contrast */ --invert-font-color: #151515; /* Dark color for inverted elements */ @@ -73,11 +72,78 @@ pre, code { border-bottom: 1px dashed var(--secondary-color); } */ -.terminal-mkdocs-main-content{ +.terminal-mkdocs-main-content { line-height: var(--global-line-height); } -strong, .highlight { +strong, +.highlight { /* background: url(//s2.svgbox.net/pen-brushes.svg?ic=brush-1&color=50ffff); */ background-color: #50ffff33; +} + +.terminal-card > header { + color: var(--font-color); + text-align: center; + background-color: var(--progress-bar-background); + padding: 0.3em 0.5em; +} +.btn.btn-sm { + color: var(--font-color); + padding: 0.2em 0.5em; + font-size: 0.8em; +} + +.loading-message { + display: none; + margin-top: 20px; +} + +.response-section { + display: none; + padding-top: 20px; +} + +.tabs { + display: flex; + flex-direction: column; +} +.tab-list { + display: flex; + padding: 0; + margin: 0; + list-style-type: none; + border-bottom: 1px solid var(--font-color); +} +.tab-item { + cursor: pointer; + padding: 10px; + border: 1px solid var(--font-color); + margin-right: -1px; + border-bottom: none; +} +.tab-item:hover, +.tab-item:focus, 
+.tab-item:active { + background-color: var(--progress-bar-background); +} +.tab-content { + display: none; + border: 1px solid var(--font-color); + border-top: none; +} +.tab-content:first-of-type { + display: block; +} + +.tab-content header { + padding: 0.5em; + display: flex; + justify-content: end; + align-items: center; + background-color: var(--progress-bar-background); +} +.tab-content pre { + margin: 0; + max-height: 300px; overflow: auto; border:none; } \ No newline at end of file diff --git a/docs/md/demo.md b/docs/md/demo.md new file mode 100644 index 00000000..f97b658e --- /dev/null +++ b/docs/md/demo.md @@ -0,0 +1,198 @@ +# Interactive Demo for Crawl4AI +
+
+
+ Enter URL and Options +
+ + +
+
+ + +
+
+ +
+
+
+ +
+
Loading... Please wait.
+
+ +
+

Response

+
+
    +
  • Markdown
  • +
  • Cleaned HTML
  • +
  • Media
  • +
  • Extracted Content
  • +
  • Screenshot
  • +
  • Python Code
  • +
+
+
+
+ + +
+
+
+
+ + + + + + + + + + +
+
+ + +
\ No newline at end of file diff --git a/docs/md/index.md b/docs/md/index.md index e5cda8a4..99a4f934 100644 --- a/docs/md/index.md +++ b/docs/md/index.md @@ -2,6 +2,11 @@ Welcome to the official documentation for Crawl4AI! 🕷️🤖 Crawl4AI is an open-source Python library designed to simplify web crawling and extract useful information from web pages. This documentation will guide you through the features, usage, and customization of Crawl4AI. + +## Try the [Demo](demo.md) + +Just try it now and crawl different pages to see how it works. You can set the links, see the structures of the output, and also view the Python sample code on how to run it. The old demo is available at [/old_demo](/old) where you can see more details. + ## Introduction Crawl4AI has one clear task: to make crawling and data extraction from web pages easy and efficient, especially for large language models (LLMs) and AI applications. Whether you are using it as a REST API or a Python library, Crawl4AI offers a robust and flexible solution. diff --git a/docs/md/interactive_content.html b/docs/md/interactive_content.html new file mode 100644 index 00000000..6043e2d9 --- /dev/null +++ b/docs/md/interactive_content.html @@ -0,0 +1,28 @@ +

Try Our Library

+
+ + + +
+
+ + diff --git a/main.py b/main.py index 184f5a6b..449893e3 100644 --- a/main.py +++ b/main.py @@ -63,6 +63,11 @@ class CrawlRequest(BaseModel): @app.get("/", response_class=HTMLResponse) +async def read_index(request: Request): + # redirect to site/index.html + return templates.TemplateResponse("index.html", {"request": request}) + +@app.get("/old", response_class=HTMLResponse) async def read_index(request: Request): partials_dir = os.path.join(__location__, "pages", "partial") partials = {} diff --git a/mkdocs.yml b/mkdocs.yml index c22936a8..673b7d9c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -2,9 +2,11 @@ site_name: Crawl4AI Documentation docs_dir: docs/md nav: - Home: index.md - - Introduction: introduction.md - - Installation: installation.md - - Quick Start: quickstart.md + - Demo: demo.md # Add this line + - First Steps: + - Introduction: introduction.md + - Installation: installation.md + - Quick Start: quickstart.md - Examples: - Intro: examples/index.md - LLM Extraction: examples/llm_extraction.md @@ -21,8 +23,9 @@ nav: - API Reference: - Core Classes and Functions: api/core_classes_and_functions.md - Detailed API Documentation: api/detailed_api_documentation.md - - Change Log: changelog.md - - Contact: contact.md + - Miscellaneous: + - Change Log: changelog.md + - Contact: contact.md theme: name: terminal @@ -36,4 +39,4 @@ extra_css: extra_javascript: - assets/highlight.min.js - - assets/highlight_init.js \ No newline at end of file + - assets/highlight_init.js