* fix: Update export of URLPatternFilter * chore: Add dependency for cchardet in requirements * docs: Update example for deep crawl in release note for v0.5 * Docs: update the example for memory dispatcher * docs: updated example for crawl strategies * Refactor: Removed wrapping in if __name__==main block since this is a markdown file. * chore: removed cchardet from dependency list, since unclecode is planning to remove it * docs: updated the example for proxy rotation to a working example * feat: Introduced ProxyConfig param * Add tutorial for deep crawl & update contributor list for bug fixes in feb alpha-1 * chore: update and test new dependencies * feat: Make PyPDF2 a conditional dependency * updated tutorial and release note for v0.5 * docs: update docs for deep crawl, and fix a typo in docker-deployment markdown filename * refactor: 1. Deprecate markdown_v2 2. Make markdown backward compatible to behave as a string when needed. 3. Fix LlmConfig usage in cli 4. Deprecate markdown_v2 in cli 5. Update AsyncWebCrawler for changes in CrawlResult * fix: Bug in serialisation of markdown in acache_url * Refactor: Added deprecation errors for fit_html and fit_markdown directly on markdown. Now access them via markdown * fix: remove deprecated markdown_v2 from docker * Refactor: remove deprecated fit_markdown and fit_html from result * refactor: fix cache retrieval for markdown as a string * chore: update all docs, examples and tests with deprecation announcements for markdown_v2, fit_html, fit_markdown
48 lines
1.1 KiB
Python
48 lines
1.1 KiB
Python
# deep_crawling/__init__.py
"""Package initializer for ``deep_crawling``.

Re-exports the names imported below from the ``base_strategy``,
``bfs_strategy``, ``bff_strategy``, ``dfs_strategy``, ``filters`` and
``scorers`` submodules, and lists them in ``__all__`` so they form the
package's declared public interface.
"""

from .base_strategy import DeepCrawlDecorator, DeepCrawlStrategy
from .bfs_strategy import BFSDeepCrawlStrategy
from .bff_strategy import BestFirstCrawlingStrategy
from .dfs_strategy import DFSDeepCrawlStrategy
from .filters import (
    FilterChain,
    ContentTypeFilter,
    DomainFilter,
    URLFilter,
    URLPatternFilter,
    FilterStats,
    ContentRelevanceFilter,
    SEOFilter,
)
from .scorers import (
    KeywordRelevanceScorer,
    URLScorer,
    CompositeScorer,
    DomainAuthorityScorer,
    FreshnessScorer,
    PathDepthScorer,
    ContentTypeScorer,
)

# Every name imported above is listed here, in import order, so that
# ``from deep_crawling import *`` exposes exactly this set.
__all__ = [
    # from .base_strategy
    "DeepCrawlDecorator",
    "DeepCrawlStrategy",
    # from .bfs_strategy / .bff_strategy / .dfs_strategy
    "BFSDeepCrawlStrategy",
    "BestFirstCrawlingStrategy",
    "DFSDeepCrawlStrategy",
    # from .filters
    "FilterChain",
    "ContentTypeFilter",
    "DomainFilter",
    "URLFilter",
    "URLPatternFilter",
    "FilterStats",
    "ContentRelevanceFilter",
    "SEOFilter",
    # from .scorers
    "KeywordRelevanceScorer",
    "URLScorer",
    "CompositeScorer",
    "DomainAuthorityScorer",
    "FreshnessScorer",
    "PathDepthScorer",
    "ContentTypeScorer",
]