Remove dependency on the spaCy model.

This commit is contained in:
unclecode
2024-05-17 15:08:03 +08:00
parent f85df91ca6
commit a5f9d07dbf
18 changed files with 123 additions and 83955 deletions

View File

@@ -59,12 +59,6 @@ def understanding_parameters(crawler):
cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
print_result(result)
# Retrieve raw HTML content
cprint("\n🔄 [bold cyan]'include_raw_html' parameter example:[/bold cyan]", True)
result = crawler.run(url="https://www.nbcnews.com/business", include_raw_html=False)
cprint("[LOG] 📦 [bold yellow]Crawl result (without raw HTML content):[/bold yellow]")
print_result(result)
def add_chunking_strategy(crawler):
# Adding a chunking strategy: RegexChunking
cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
@@ -177,9 +171,19 @@ def main():
crawler = create_crawler()
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.", True)
cprint("For the rest of this guide, I set crawler.always_by_pass_cache to True to force the crawler to bypass the cache. This is to ensure that we get fresh results for each run.")
crawler.always_by_pass_cache = True
cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
result = crawler.run(
url="https://www.nbcnews.com/business",
chunking_strategy=NlpSentenceChunking()
)
cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
print_result(result)
basic_usage(crawler)
understanding_parameters(crawler)
add_chunking_strategy(crawler)