From f6e25e2a6bae8a1b774b6e71fc98edc460d04b53 Mon Sep 17 00:00:00 2001
From: Aravind Karnam
Date: Wed, 7 May 2025 17:53:30 +0530
Subject: [PATCH] fix: check_robots_txt to support wildcard rules

ref: #699
---
 crawl4ai/utils.py | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index bfa8ce9d..4018d78c 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -42,6 +42,29 @@ from itertools import chain
 from collections import deque
 from typing import Generator, Iterable
 
+# Monkey patch to fix wildcard handling in urllib.robotparser
+from urllib.robotparser import RuleLine
+import re
+
+original_applies_to = RuleLine.applies_to
+
+def patched_applies_to(self, filename):
+    # Handle wildcards in paths
+    if '*' in self.path or '%2A' in self.path or self.path in ("*", "%2A"):
+        pattern = self.path.replace('%2A', '*')
+        pattern = re.escape(pattern).replace('\\*', '.*')
+        pattern = '^' + pattern
+        if pattern.endswith('\\$'):
+            pattern = pattern[:-2] + '$'
+        try:
+            return bool(re.match(pattern, filename))
+        except re.error:
+            return original_applies_to(self, filename)
+    return original_applies_to(self, filename)
+
+RuleLine.applies_to = patched_applies_to
+# Monkey patch ends
+
 def chunk_documents(
     documents: Iterable[str],
     chunk_token_threshold: int,
@@ -303,7 +326,7 @@ class RobotsParser:
             robots_url = f"{scheme}://{domain}/robots.txt"
 
             async with aiohttp.ClientSession() as session:
-                async with session.get(robots_url, timeout=2) as response:
+                async with session.get(robots_url, timeout=2, ssl=False) as response:
                     if response.status == 200:
                         rules = await response.text()
                         self._cache_rules(domain, rules)
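
A quick sanity check of the patched wildcard handling (a minimal sketch, not
part of the patch itself): it assumes crawl4ai is importable, so loading
crawl4ai.utils installs the RuleLine monkey patch above. The robots.txt rules
and example.com URLs below are made up for illustration.

    from urllib.robotparser import RobotFileParser

    # Side effect: importing this module patches RuleLine.applies_to.
    import crawl4ai.utils  # noqa: F401

    parser = RobotFileParser()
    parser.parse([
        "User-agent: *",
        "Disallow: /private*",  # trailing wildcard
        "Disallow: /*.pdf",     # wildcard before an extension
    ])

    # Stock urllib.robotparser compares rule paths with startswith(), so the
    # '*' is treated literally and both disallowed URLs would slip through.
    print(parser.can_fetch("*", "https://example.com/private/page"))   # False
    print(parser.can_fetch("*", "https://example.com/docs/file.pdf"))  # False
    print(parser.can_fetch("*", "https://example.com/public/page"))    # True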