# Examples Outline for crawl4ai - markdown Component **Target Document Type:** Examples Collection **Target Output Filename Suggestion:** `llm_examples_markdown.md` **Library Version Context:** 0.6.3 **Outline Generation Date:** 2025-05-24 --- This document provides practical, runnable code examples for the `markdown` component of the `crawl4ai` library, focusing on the `DefaultMarkdownGenerator` and its various configurations. ## 1. Basic Markdown Generation with `DefaultMarkdownGenerator` ### 1.1. Example: Generating Markdown with default `DefaultMarkdownGenerator` settings via `AsyncWebCrawler`. This example demonstrates the most basic usage of `DefaultMarkdownGenerator` within an `AsyncWebCrawler` run. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode async def basic_markdown_generation_via_crawler(): # DefaultMarkdownGenerator will be used by default if markdown_generator is not specified, # but we explicitly set it here for clarity. md_generator = DefaultMarkdownGenerator() config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS # Use BYPASS for fresh content in examples ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com", config=config) if result.success and result.markdown: print("--- Raw Markdown (First 300 chars) ---") print(result.markdown.raw_markdown[:300]) print("\n--- Markdown with Citations (First 300 chars) ---") print(result.markdown.markdown_with_citations[:300]) print("\n--- References Markdown ---") print(result.markdown.references_markdown) # example.com has no outbound links usually else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(basic_markdown_generation_via_crawler()) ``` --- ### 1.2. Example: Direct instantiation and use of `DefaultMarkdownGenerator`. You can use `DefaultMarkdownGenerator` directly if you already have HTML content. ```python from crawl4ai import DefaultMarkdownGenerator def direct_markdown_generation(): generator = DefaultMarkdownGenerator() html_content = """ Test Page

Welcome to Example

This is a paragraph with a link.

Another paragraph follows.

""" # base_url is important for resolving relative links if any, and for citation context result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com") print("--- Raw Markdown (Direct Generation) ---") print(result_md.raw_markdown) print("\n--- Markdown with Citations (Direct Generation) ---") print(result_md.markdown_with_citations) print("\n--- References Markdown (Direct Generation) ---") print(result_md.references_markdown) if __name__ == "__main__": direct_markdown_generation() ``` --- ## 2. Citation Management in Markdown ### 2.1. Example: Default citation behavior (citations enabled). By default, `DefaultMarkdownGenerator` generates citations for links. ```python from crawl4ai import DefaultMarkdownGenerator def default_citation_behavior(): generator = DefaultMarkdownGenerator() html_content = """

Check out Crawl4ai and our documentation.

""" result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com") print("--- Raw Markdown ---") print(result_md.raw_markdown) print("\n--- Markdown with Citations ---") print(result_md.markdown_with_citations) print("\n--- References Markdown ---") print(result_md.references_markdown) if __name__ == "__main__": default_citation_behavior() ``` --- ### 2.2. Example: Disabling citations in `DefaultMarkdownGenerator`. You can disable citation generation by setting `citations=False` in the `generate_markdown` method. ```python from crawl4ai import DefaultMarkdownGenerator def disabling_citations(): generator = DefaultMarkdownGenerator() html_content = """

A link to another site will not be cited.

""" # Disable citations for this specific call result_md_no_citations = generator.generate_markdown( input_html=html_content, base_url="https://example.com", citations=False ) print("--- Raw Markdown (Citations Disabled) ---") print(result_md_no_citations.raw_markdown) print("\n--- Markdown with Citations (Citations Disabled) ---") # This should be the same as raw_markdown when citations=False print(result_md_no_citations.markdown_with_citations) print("\n--- References Markdown (Citations Disabled) ---") # This should be empty or minimal print(result_md_no_citations.references_markdown) # For comparison, with citations enabled (default) result_md_with_citations = generator.generate_markdown( input_html=html_content, base_url="https://example.com", citations=True # Default ) print("\n--- For Comparison: Markdown with Citations (Enabled) ---") print(result_md_with_citations.markdown_with_citations) print("\n--- For Comparison: References Markdown (Enabled) ---") print(result_md_with_citations.references_markdown) if __name__ == "__main__": disabling_citations() ``` --- ### 2.3. Example: Impact of `base_url` on citation links for relative URLs. The `base_url` parameter is crucial for correctly resolving relative URLs in your HTML content into absolute URLs in the references. ```python from crawl4ai import DefaultMarkdownGenerator def base_url_impact_on_citations(): generator = DefaultMarkdownGenerator() html_content = """

Links: Features, Pricing, and an absolute link to External Resource.

""" print("--- Case 1: With base_url='https://example.com/products/' ---") result_md_case1 = generator.generate_markdown( input_html=html_content, base_url="https://example.com/products/" ) print(result_md_case1.references_markdown) print("\n--- Case 2: With base_url='https://another-domain.net/' ---") result_md_case2 = generator.generate_markdown( input_html=html_content, base_url="https://another-domain.net/" ) print(result_md_case2.references_markdown) print("\n--- Case 3: Without base_url (relative links might be incomplete) ---") result_md_case3 = generator.generate_markdown(input_html=html_content) print(result_md_case3.references_markdown) if __name__ == "__main__": base_url_impact_on_citations() ``` --- ### 2.4. Example: Handling HTML with no links (empty `references_markdown`). If the input HTML contains no hyperlinks, the `references_markdown` will be empty. ```python from crawl4ai import DefaultMarkdownGenerator def no_links_in_html(): generator = DefaultMarkdownGenerator() html_content = "

This is a paragraph with no links at all.

Just some bold text." result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com") print("--- Raw Markdown ---") print(result_md.raw_markdown) print("\n--- Markdown with Citations ---") print(result_md.markdown_with_citations) # Should be same as raw_markdown print("\n--- References Markdown ---") print(f"'{result_md.references_markdown}'") # Should be empty or contain minimal boilerplate if __name__ == "__main__": no_links_in_html() ``` --- ## 3. Controlling `html2text` Conversion Options The `DefaultMarkdownGenerator` uses the `html2text` library internally. You can pass options to `html2text` either during generator initialization (`options` parameter) or during the `generate_markdown` call (`html2text_options` parameter). ### 3.1. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore links. This will prevent links from appearing in the Markdown output altogether (different from `citations=False` which keeps link text but omits citation markers). ```python from crawl4ai import DefaultMarkdownGenerator def ignore_links_option(): # Initialize with html2text option to ignore links generator = DefaultMarkdownGenerator(options={"ignore_links": True}) html_content = "

A link to Example Site and some text.

" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown (ignore_links=True) ---") print(result_md.raw_markdown) # Link text might be present or absent based on html2text behavior print("--- Markdown with Citations (ignore_links=True) ---") print(result_md.markdown_with_citations) # No citations as links are ignored print("--- References (ignore_links=True) ---") print(f"'{result_md.references_markdown}'") # Should be empty if __name__ == "__main__": ignore_links_option() ``` --- ### 3.2. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore images. This will prevent image references (like `![alt text](src)`) from appearing in the Markdown. ```python from crawl4ai import DefaultMarkdownGenerator def ignore_images_option(): generator = DefaultMarkdownGenerator(options={"ignore_images": True}) html_content = "

An image: My Test Image

" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown (ignore_images=True) ---") print(result_md.raw_markdown) # Image markdown should be absent if __name__ == "__main__": ignore_images_option() ``` --- ### 3.3. Example: Initializing `DefaultMarkdownGenerator` with `options` for `body_width=0` (no line wrapping). `body_width=0` tells `html2text` not to wrap lines. ```python from crawl4ai import DefaultMarkdownGenerator def no_line_wrapping_option(): generator = DefaultMarkdownGenerator(options={"body_width": 0}) long_text = "This is a very long line of text that would normally be wrapped by html2text. " * 5 html_content = f"

{long_text}

" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown (body_width=0) ---") print(result_md.raw_markdown) # Observe the long line without soft wraps if __name__ == "__main__": no_line_wrapping_option() ``` --- ### 3.4. Example: Initializing `DefaultMarkdownGenerator` to disable emphasis. This will remove formatting for `` and `` tags. ```python from crawl4ai import DefaultMarkdownGenerator def ignore_emphasis_option(): generator = DefaultMarkdownGenerator(options={"ignore_emphasis": True}) html_content = "

Normal, emphasized, and strongly emphasized text.

" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown (ignore_emphasis=True) ---") print(result_md.raw_markdown) # Emphasis should be gone if __name__ == "__main__": ignore_emphasis_option() ``` --- ### 3.5. Example: Overriding `html2text_options` at `generate_markdown` call time. Options passed to `generate_markdown` via `html2text_options` take precedence. ```python from crawl4ai import DefaultMarkdownGenerator def override_html2text_options(): # Initial generator might have some defaults generator = DefaultMarkdownGenerator(options={"ignore_links": False}) html_content = "

Link: Example.

" # Override at call time to protect links result_md = generator.generate_markdown( input_html=html_content, html2text_options={"protect_links": True} # Links will be ) print("--- Markdown (protect_links=True via call-time override) ---") print(result_md.raw_markdown) if __name__ == "__main__": override_html2text_options() ``` --- ### 3.6. Example: Combining multiple `html2text` options. Multiple options can be combined for fine-grained control over the Markdown output. ```python from crawl4ai import DefaultMarkdownGenerator def combined_html2text_options(): generator = DefaultMarkdownGenerator(options={ "ignore_links": True, "ignore_images": True, "body_width": 60 # Wrap at 60 characters }) html_content = """

This is a paragraph with a link to ignore and an image to ignore. It also has some long text to demonstrate wrapping. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.

""" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown (Combined Options: ignore_links, ignore_images, body_width=60) ---") print(result_md.raw_markdown) if __name__ == "__main__": combined_html2text_options() ``` --- ## 4. Selecting the HTML Content Source for Markdown Generation The `DefaultMarkdownGenerator` can generate Markdown from different HTML sources within the `CrawlResult`. ### 4.1. Example: Markdown from `cleaned_html` (default `content_source`). This is the default behavior. `cleaned_html` is the HTML after `WebScrapingStrategy` (e.g., `LXMLWebScrapingStrategy`) has processed it. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode async def markdown_from_cleaned_html(): # Default content_source is "cleaned_html" md_generator = DefaultMarkdownGenerator() config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: # Using a more complex page to see the effect of cleaning result = await crawler.arun(url="https://news.ycombinator.com", config=config) if result.success and result.markdown: print("--- Markdown from Cleaned HTML (Default - First 300 chars) ---") print(result.markdown.raw_markdown[:300]) # For comparison, show a snippet of cleaned_html print("\n--- Cleaned HTML (Source - First 300 chars) ---") print(result.cleaned_html[:300]) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(markdown_from_cleaned_html()) ``` --- ### 4.2. Example: Markdown from `raw_html`. This example uses the original, unprocessed HTML fetched from the URL as the source for Markdown generation. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode async def markdown_from_raw_html(): md_generator = DefaultMarkdownGenerator(content_source="raw_html") config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com", config=config) if result.success and result.markdown: print("--- Markdown from Raw HTML (First 300 chars) ---") print(result.markdown.raw_markdown[:300]) print("\n--- Raw Page HTML (Source - First 300 chars for comparison) ---") print(result.html[:300]) # result.html contains the raw HTML else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(markdown_from_raw_html()) ``` --- ### 4.3. Example: Markdown from `fit_html` (requires a `ContentFilterStrategy`). `fit_html` is the HTML content after a `ContentFilterStrategy` (like `PruningContentFilter`) has processed it. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter async def markdown_from_fit_html(): # A content filter must run to produce fit_html pruning_filter = PruningContentFilter() md_generator = DefaultMarkdownGenerator( content_filter=pruning_filter, content_source="fit_html" # Explicitly use the output of the filter ) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: # Using a news site which PruningContentFilter can work on result = await crawler.arun(url="https://news.ycombinator.com", config=config) if result.success and result.markdown: print("--- Markdown from Fit HTML (Output of PruningFilter - First 300 chars) ---") # When content_source="fit_html", result.markdown.raw_markdown IS from fit_html print(result.markdown.raw_markdown[:300]) print("\n--- Fit HTML itself (Source - First 300 chars for comparison) ---") print(result.markdown.fit_html[:300]) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(markdown_from_fit_html()) ``` --- ## 5. Integration with Content Filters `DefaultMarkdownGenerator` can work in conjunction with `ContentFilterStrategy` instances. If a filter is provided, it will produce `fit_html` and `fit_markdown`. ### 5.1. Example: `DefaultMarkdownGenerator` with `PruningContentFilter`. The `PruningContentFilter` attempts to remove boilerplate and keep main content. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter async def md_with_pruning_filter(): pruning_filter = PruningContentFilter() # By default, raw_markdown is from cleaned_html, fit_markdown is from fit_html md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://news.ycombinator.com", config=config) if result.success and result.markdown: print("--- Raw Markdown (from cleaned_html - First 200 chars) ---") print(result.markdown.raw_markdown[:200]) print("\n--- Fit Markdown (from PruningFilter's fit_html - First 200 chars) ---") print(result.markdown.fit_markdown[:200]) print("\n--- Fit HTML (Source for Fit Markdown - First 200 chars) ---") print(result.markdown.fit_html[:200]) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(md_with_pruning_filter()) ``` --- ### 5.2. Example: `DefaultMarkdownGenerator` with `BM25ContentFilter`. `BM25ContentFilter` filters content based on relevance to a user query. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode from crawl4ai.content_filter_strategy import BM25ContentFilter async def md_with_bm25_filter(): bm25_filter = BM25ContentFilter(user_query="Python programming language features") md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: # Using a relevant page for the query result = await crawler.arun(url="https://docs.python.org/3/tutorial/classes.html", config=config) if result.success and result.markdown: print("--- Fit Markdown (from BM25Filter - First 300 chars) ---") print(result.markdown.fit_markdown[:300]) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(md_with_bm25_filter()) ``` --- ### 5.3. Example: `DefaultMarkdownGenerator` with `LLMContentFilter`. `LLMContentFilter` uses an LLM to intelligently filter or summarize content based on instructions. (Requires API Key) ```python import asyncio import os from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, LLMConfig, CacheMode from crawl4ai.content_filter_strategy import LLMContentFilter async def md_with_llm_filter(): openai_api_key = os.getenv("OPENAI_API_KEY") if not openai_api_key: print("OPENAI_API_KEY not found. Skipping LLMContentFilter example.") return llm_config = LLMConfig(api_token=openai_api_key, provider="openai/gpt-3.5-turbo") llm_filter = LLMContentFilter( llm_config=llm_config, instruction="Summarize the main arguments presented in this Hacker News discussion thread." ) md_generator = DefaultMarkdownGenerator(content_filter=llm_filter) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS # Fresh run for LLM ) async with AsyncWebCrawler() as crawler: # Example Hacker News discussion result = await crawler.arun(url="https://news.ycombinator.com/item?id=39000000", config=config) # A past popular item if result.success and result.markdown: print("--- Fit Markdown (from LLMContentFilter - First 500 chars) ---") print(result.markdown.fit_markdown[:500]) llm_filter.show_usage() # Show token usage else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(md_with_llm_filter()) ``` --- ### 5.4. Example: Forcing Markdown generation from `fit_html` when a filter is active. This example shows how to ensure the `raw_markdown` itself is generated from the `fit_html` (output of the filter) rather than `cleaned_html`. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter async def md_forced_from_fit_html(): pruning_filter = PruningContentFilter() # Explicitly set content_source to "fit_html" md_generator = DefaultMarkdownGenerator( content_filter=pruning_filter, content_source="fit_html" ) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://news.ycombinator.com", config=config) if result.success and result.markdown: print("--- Raw Markdown (forced from fit_html - First 300 chars) ---") # This raw_markdown is now generated from the output of PruningFilter print(result.markdown.raw_markdown[:300]) print("\n--- Fit HTML (Source for Raw Markdown - First 300 chars) ---") print(result.markdown.fit_html[:300]) print("\n--- Fit Markdown (should be same as Raw Markdown here - First 300 chars) ---") print(result.markdown.fit_markdown[:300]) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(md_forced_from_fit_html()) ``` --- ### 5.5. Example: Markdown generation when no filter is active. If no `content_filter` is provided to `DefaultMarkdownGenerator`, `fit_markdown` and `fit_html` will be empty or None. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode async def md_no_filter(): md_generator = DefaultMarkdownGenerator() # No filter provided config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com", config=config) if result.success and result.markdown: print("--- Raw Markdown (First 300 chars) ---") print(result.markdown.raw_markdown[:300]) print("\n--- Fit Markdown (Expected: None or empty) ---") print(result.markdown.fit_markdown) print("\n--- Fit HTML (Expected: None or empty) ---") print(result.markdown.fit_html) else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(md_no_filter()) ``` --- ## 6. Understanding `MarkdownGenerationResult` Output Fields ### 6.1. Example: Accessing all fields of `MarkdownGenerationResult`. This example demonstrates how to access all the different Markdown and HTML outputs available in the `MarkdownGenerationResult` object. ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode from crawl4ai.content_filter_strategy import PruningContentFilter # Using a filter to populate fit_html/fit_markdown async def access_all_markdown_fields(): # Setup with a filter to ensure fit_html and fit_markdown are generated content_filter = PruningContentFilter() md_generator = DefaultMarkdownGenerator( content_filter=content_filter, content_source="cleaned_html" # raw_markdown will be from cleaned_html ) config = CrawlerRunConfig( markdown_generator=md_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: # Using a content-rich page result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=config) if result.success and result.markdown: md_result = result.markdown print("--- Accessing MarkdownGenerationResult Fields ---") print(f"\n1. Raw Markdown (from '{md_generator.content_source}' - snippet):") print(md_result.raw_markdown[:300] + "...") print(f"\n2. Markdown with Citations (snippet):") print(md_result.markdown_with_citations[:300] + "...") print(f"\n3. References Markdown (snippet):") print(md_result.references_markdown[:200] + "...") print(f"\n4. Fit HTML (from ContentFilter - snippet):") if md_result.fit_html: print(md_result.fit_html[:300] + "...") else: print("None (No filter or filter produced no output)") print(f"\n5. Fit Markdown (from fit_html - snippet):") if md_result.fit_markdown: print(md_result.fit_markdown[:300] + "...") else: print("None (No filter or filter produced no output)") else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(access_all_markdown_fields()) ``` --- ## 7. Advanced and Specific Scenarios ### 7.1. Example: Handling HTML with complex table structures. `DefaultMarkdownGenerator` (via `html2text`) attempts to render HTML tables into Markdown tables. ```python from crawl4ai import DefaultMarkdownGenerator def markdown_for_tables(): generator = DefaultMarkdownGenerator() html_content = """

Product Comparison

FeatureProduct AProduct B
Price$100$120
Rating4.5 stars4.2 stars
Multi-row
Feature
Supported by Both
""" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown for Table ---") print(result_md.raw_markdown) if __name__ == "__main__": markdown_for_tables() ``` --- ### 7.2. Example: Handling HTML with code blocks. Code blocks are generally preserved in Markdown format. ```python from crawl4ai import DefaultMarkdownGenerator def markdown_for_code_blocks(): generator = DefaultMarkdownGenerator() html_content = """

Here is some Python code:


def greet(name):
    print(f"Hello, {name}!")

greet("World")
        

And an inline example_function().

""" result_md = generator.generate_markdown(input_html=html_content) print("--- Markdown for Code Blocks ---") print(result_md.raw_markdown) if __name__ == "__main__": markdown_for_code_blocks() ``` --- ### 7.3. Example: Using a custom `MarkdownGenerationStrategy` (conceptual). You can create your own Markdown generation logic by subclassing `MarkdownGenerationStrategy`. ```python import asyncio from crawl4ai import ( AsyncWebCrawler, CrawlerRunConfig, CacheMode, MarkdownGenerationStrategy, MarkdownGenerationResult ) # Define a minimal custom Markdown generator class CustomMarkdownGenerator(MarkdownGenerationStrategy): def __init__(self, prefix="CUSTOM MD: ", **kwargs): super().__init__(**kwargs) # Pass along any other options self.prefix = prefix def generate_markdown( self, input_html: str, base_url: str = "", html2text_options: dict = None, # Can be used by html2text citations: bool = True, # Standard param **kwargs # For other potential strategy-specific params ) -> MarkdownGenerationResult: # Simplified custom logic: just prefix and take a snippet # A real custom generator would do more sophisticated parsing/conversion custom_raw_md = self.prefix + input_html[:100].strip() + "..." # For simplicity, we'll just return the custom raw markdown for all fields return MarkdownGenerationResult( raw_markdown=custom_raw_md, markdown_with_citations=custom_raw_md, # No real citation logic here references_markdown="", fit_markdown=None, # Not implementing filtering here fit_html=None ) async def use_custom_markdown_generator(): custom_generator = CustomMarkdownGenerator(prefix="[MyGenerator Says]: ") config = CrawlerRunConfig( markdown_generator=custom_generator, cache_mode=CacheMode.BYPASS ) async with AsyncWebCrawler() as crawler: result = await crawler.arun(url="https://example.com", config=config) if result.success and result.markdown: print("--- Output from CustomMarkdownGenerator ---") print(result.markdown.raw_markdown) # Since our custom generator doesn't really do citations or filtering: print(f"Citations: '{result.markdown.markdown_with_citations}'") print(f"References: '{result.markdown.references_markdown}'") print(f"Fit Markdown: '{result.markdown.fit_markdown}'") else: print(f"Crawl failed: {result.error_message}") if __name__ == "__main__": asyncio.run(use_custom_markdown_generator()) ``` --- **End of Examples Document** ```