In this commit, we introduce the new concept of MarkdownGenerationStrategy, which allows us to add further strategies in the future for generating better markdown. Right now, we generate raw markdown as we did before. We have a new algorithm for fitting markdown based on BM25, and we now add the ability to refine markdown into a citation form: links are extracted and replaced by citation reference numbers, and a references section at the very end lists all the links with their descriptions. This format is more suitable for large language models. In case we don't need to pass links, we can reduce the size of the markdown significantly and also attach the list of references to a large language model as a separate file. This commit contains changes in this direction.

This commit is contained in:
UncleCode
2024-11-21 18:21:43 +08:00
parent 7047422e48
commit dbb751c8f0
12 changed files with 506 additions and 762 deletions

View File

@@ -7,14 +7,14 @@ from pathlib import Path
from typing import Optional, List, Union
import json
import asyncio
from .models import CrawlResult
from .models import CrawlResult, MarkdownGenerationResult
from .async_database import async_db_manager
from .chunking_strategy import *
from .content_filter_strategy import *
from .extraction_strategy import *
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
from .content_scrapping_strategy import WebScrapingStrategy
from .content_scraping_strategy import WebScrapingStrategy
from .async_logger import AsyncLogger
from .config import (
@@ -476,7 +476,7 @@ class AsyncWebCrawler:
html,
word_count_threshold=word_count_threshold,
css_selector=css_selector,
only_text=kwargs.get("only_text", False),
only_text=kwargs.pop("only_text", False),
image_description_min_word_threshold=kwargs.get(
"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
),
@@ -491,6 +491,8 @@ class AsyncWebCrawler:
except Exception as e:
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
markdown_v2: MarkdownGenerationResult = result.get("markdown_v2", None)
cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
markdown = sanitize_input_encode(result.get("markdown", ""))
fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
@@ -542,6 +544,7 @@ class AsyncWebCrawler:
url=url,
html=html,
cleaned_html=format_html(cleaned_html),
markdown_v2=markdown_v2,
markdown=markdown,
fit_markdown=fit_markdown,
fit_html= fit_html,