Compare commits
2 Commits
main
...
fix/deprec
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
eca04b0368 | ||
|
|
c2c4d42be4 |
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if el.tag in bypass_tags:
|
if el.tag in bypass_tags:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||||
|
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||||
|
is_in_code_block = False
|
||||||
|
ancestor = el.getparent()
|
||||||
|
while ancestor is not None:
|
||||||
|
if ancestor.tag in ("pre", "code"):
|
||||||
|
is_in_code_block = True
|
||||||
|
break
|
||||||
|
ancestor = ancestor.getparent()
|
||||||
|
|
||||||
|
if is_in_code_block:
|
||||||
|
continue
|
||||||
|
|
||||||
text_content = (el.text_content() or "").strip()
|
text_content = (el.text_content() or "").strip()
|
||||||
if (
|
if (
|
||||||
len(text_content.split()) < word_count_threshold
|
len(text_content.split()) < word_count_threshold
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
|
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict
|
||||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||||
from typing import AsyncGenerator
|
from typing import AsyncGenerator
|
||||||
from typing import Generic, TypeVar
|
from typing import Generic, TypeVar
|
||||||
@@ -153,8 +153,7 @@ class CrawlResult(BaseModel):
|
|||||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
||||||
|
|
||||||
class Config:
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
arbitrary_types_allowed = True
|
|
||||||
|
|
||||||
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
|
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
|
||||||
# and model_dump override all exist to support a smooth transition from markdown as a string
|
# and model_dump override all exist to support a smooth transition from markdown as a string
|
||||||
@@ -332,8 +331,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
|
|
||||||
class Config:
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
arbitrary_types_allowed = True
|
|
||||||
|
|
||||||
###############################
|
###############################
|
||||||
# Scraping Models
|
# Scraping Models
|
||||||
|
|||||||
Reference in New Issue
Block a user