#1167 Add PHP MIME types to ContentTypeFilter for better file handling
This commit is contained in:
@@ -337,6 +337,15 @@ class ContentTypeFilter(URLFilter):
|
|||||||
"sqlite": "application/vnd.sqlite3",
|
"sqlite": "application/vnd.sqlite3",
|
||||||
# Placeholder
|
# Placeholder
|
||||||
"unknown": "application/octet-stream", # Fallback for unknown file types
|
"unknown": "application/octet-stream", # Fallback for unknown file types
|
||||||
|
# php
|
||||||
|
"php": "application/x-httpd-php",
|
||||||
|
"php3": "application/x-httpd-php",
|
||||||
|
"php4": "application/x-httpd-php",
|
||||||
|
"php5": "application/x-httpd-php",
|
||||||
|
"php7": "application/x-httpd-php",
|
||||||
|
"phtml": "application/x-httpd-php",
|
||||||
|
"phps": "application/x-httpd-php-source",
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|||||||
75
tests/deep_crwaling/test_filter.py
Normal file
75
tests/deep_crwaling/test_filter.py
Normal file
@@ -0,0 +1,75 @@
|
|||||||
|
# // File: tests/deep_crawling/test_filters.py
|
||||||
|
import pytest
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
from crawl4ai import ContentTypeFilter, URLFilter
|
||||||
|
|
||||||
|
# Minimal URLFilter base class stub if not already importable directly for tests
|
||||||
|
# In a real scenario, this would be imported from the library
|
||||||
|
if not hasattr(URLFilter, '_update_stats'): # Check if it's a basic stub
|
||||||
|
class URLFilter: # Basic stub for testing if needed
|
||||||
|
def __init__(self, name=None): self.name = name
|
||||||
|
def apply(self, url: str) -> bool: raise NotImplementedError
|
||||||
|
def _update_stats(self, passed: bool): pass # Mock implementation
|
||||||
|
|
||||||
|
# Assume ContentTypeFilter is structured as discussed. If its definition is not fully
|
||||||
|
# available for direct import in the test environment, a more elaborate stub or direct
|
||||||
|
# instantiation of the real class (if possible) would be needed.
|
||||||
|
# For this example, we assume ContentTypeFilter can be imported and used.
|
||||||
|
|
||||||
|
class TestContentTypeFilter:
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, allowed_types, expected",
|
||||||
|
[
|
||||||
|
# Existing tests (examples)
|
||||||
|
("http://example.com/page.html", ["text/html"], True),
|
||||||
|
("http://example.com/page.json", ["application/json"], True),
|
||||||
|
("http://example.com/image.png", ["text/html"], False),
|
||||||
|
("http://example.com/document.pdf", ["application/pdf"], True),
|
||||||
|
("http://example.com/page", ["text/html"], True), # No extension, allowed
|
||||||
|
("http://example.com/page", ["text/html"], False), # No extension, disallowed
|
||||||
|
("http://example.com/page.unknown", ["text/html"], False), # Unknown extension
|
||||||
|
|
||||||
|
# Tests for PHP extensions
|
||||||
|
("http://example.com/index.php", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/script.php3", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/legacy.php4", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/main.php5", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/api.php7", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/index.phtml", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/source.phps", ["application/x-httpd-php-source"], True),
|
||||||
|
|
||||||
|
# Test rejection of PHP extensions
|
||||||
|
("http://example.com/index.php", ["text/html"], False),
|
||||||
|
("http://example.com/script.php3", ["text/plain"], False),
|
||||||
|
("http://example.com/source.phps", ["application/x-httpd-php"], False), # Mismatch MIME
|
||||||
|
("http://example.com/source.php", ["application/x-httpd-php-source"], False), # Mismatch MIME for .php
|
||||||
|
|
||||||
|
# Test case-insensitivity of extensions in URL
|
||||||
|
("http://example.com/PAGE.HTML", ["text/html"], True),
|
||||||
|
("http://example.com/INDEX.PHP", ["application/x-httpd-php"], True),
|
||||||
|
("http://example.com/SOURCE.PHPS", ["application/x-httpd-php-source"], True),
|
||||||
|
|
||||||
|
# Test case-insensitivity of allowed_types
|
||||||
|
("http://example.com/index.php", ["APPLICATION/X-HTTPD-PHP"], True),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_apply(self, url, allowed_types, expected):
|
||||||
|
content_filter = ContentTypeFilter(
|
||||||
|
allowed_types=allowed_types
|
||||||
|
)
|
||||||
|
assert content_filter.apply(url) == expected
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"url, expected_extension",
|
||||||
|
[
|
||||||
|
("http://example.com/file.html", "html"),
|
||||||
|
("http://example.com/file.tar.gz", "gz"),
|
||||||
|
("http://example.com/path/", ""),
|
||||||
|
("http://example.com/nodot", ""),
|
||||||
|
("http://example.com/.config", "config"), # hidden file with extension
|
||||||
|
("http://example.com/path/to/archive.BIG.zip", "zip"), # Case test
|
||||||
|
]
|
||||||
|
)
|
||||||
|
def test_extract_extension(self, url, expected_extension):
|
||||||
|
# Test the static method directly
|
||||||
|
assert ContentTypeFilter._extract_extension(url) == expected_extension
|
||||||
Reference in New Issue
Block a user