This commit introduces significant updates to the LinkedIn data discovery documentation by adding two new Jupyter notebooks that provide detailed insights into data discovery processes. The previous workshop notebook has been removed to streamline the content and avoid redundancy. Additionally, the URL seeder documentation has been expanded with a new tutorial and several enhancements to existing scripts, improving usability and clarity. The changes include: - Added two new Jupyter notebooks for comprehensive LinkedIn data discovery. - Removed the previous workshop notebook to eliminate outdated content. - Updated one existing file (its name is missing from this text) to reflect new data visualization requirements. - Introduced the new URL seeder tutorial files to facilitate easier access to URL seeding techniques. - Enhanced existing Python scripts and markdown files in the URL seeder section for better documentation and examples. These changes aim to improve the overall documentation quality and user experience for developers working with LinkedIn data and URL seeding techniques.
155 lines
4.6 KiB
Python
155 lines
4.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Convert Crawl4AI URL Seeder tutorial markdown to Colab notebook format
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
from pathlib import Path
|
|
|
|
|
|
# A cell marker is a whole line of the form "# cell <number> type:<markdown|code>".
_CELL_MARKER_RE = re.compile(r'^# cell (\d+) type:(markdown|code)$')


def _to_source_lines(content):
    """Split cell text into the list form used for a notebook ``source`` field.

    Notebook readers join the ``source`` list with an empty string, so every
    line except the last has to keep its trailing newline; otherwise a
    multi-line cell collapses onto a single line when the notebook is opened.
    """
    lines = content.split('\n')
    return [line + '\n' for line in lines[:-1]] + lines[-1:]


def _build_cell(cell_type, raw_lines):
    """Build one notebook cell dict, or return None when the text is blank."""
    content = '\n'.join(raw_lines).strip()
    if not content:
        return None

    source = _to_source_lines(content)
    if cell_type == 'code':
        return {
            "cell_type": "code",
            "execution_count": None,
            "metadata": {},
            "outputs": [],
            "source": source,
        }
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source,
    }


def parse_markdown_to_cells(markdown_content):
    """Parse marker-annotated markdown into a list of notebook cells.

    The text is split on lines of the form ``# cell <n> type:<markdown|code>``.
    Everything before the first marker becomes a leading markdown cell; the
    text following each marker becomes a cell of the type named in the marker.
    Surrounding whitespace of each cell is stripped and cells that end up
    empty are omitted. The cell number in the marker is not used.

    Args:
        markdown_content: Full text of the tutorial markdown file.

    Returns:
        A list of cell dicts. Markdown cells carry ``cell_type``, ``metadata``
        and ``source``; code cells additionally carry ``execution_count``
        (None) and ``outputs`` (empty list). ``source`` is a list of lines in
        which every line but the last ends with a newline.
    """
    cells = []

    # Text before the first marker is the header, emitted as markdown.
    # Only a line that fully matches the marker pattern ends a cell, so a
    # heading that merely starts with "# cell" stays in the text instead of
    # truncating the header and dropping what follows it.
    pending_type = 'markdown'
    pending_lines = []

    for line in markdown_content.split('\n'):
        marker = _CELL_MARKER_RE.match(line)
        if marker is None:
            pending_lines.append(line)
            continue

        # A new marker closes the cell collected so far.
        cell = _build_cell(pending_type, pending_lines)
        if cell is not None:
            cells.append(cell)
        pending_type = marker.group(2)
        pending_lines = []

    # Flush whatever follows the last marker (or the whole text if there
    # were no markers at all).
    cell = _build_cell(pending_type, pending_lines)
    if cell is not None:
        cells.append(cell)

    return cells
|
|
|
|
|
|
def create_colab_notebook(cells):
    """Wrap a list of cells in a Colab-flavoured notebook dictionary.

    Args:
        cells: List of notebook cell dicts; stored as-is (not copied) under
            the ``cells`` key.

    Returns:
        A dict with ``nbformat`` 4 / ``nbformat_minor`` 0, a ``metadata``
        section holding the Colab settings, a Python 3 kernelspec and the
        language info, followed by the given cells.
    """
    # Settings that live under metadata["colab"].
    colab_settings = {
        "name": "Crawl4AI_URL_Seeder_Tutorial.ipynb",
        "provenance": [],
        "collapsed_sections": [],
        "toc_visible": True,
    }
    kernel = {"name": "python3", "display_name": "Python 3"}
    language = {"name": "python"}

    metadata = {
        "colab": colab_settings,
        "kernelspec": kernel,
        "language_info": language,
    }

    # Keys are added in the same order the serialized notebook lists them.
    notebook = {"nbformat": 4, "nbformat_minor": 0}
    notebook["metadata"] = metadata
    notebook["cells"] = cells
    return notebook
|
|
|
|
|
|
def main():
    """Convert ``tutorial_url_seeder.md`` into a Colab notebook file.

    Reads the markdown tutorial from a path relative to the current working
    directory, parses it into cells, wraps them in a notebook structure and
    writes ``Crawl4AI_URL_Seeder_Tutorial.ipynb`` as UTF-8 JSON. Progress and
    a per-type cell summary are printed to stdout. If the markdown file does
    not exist, an error message is printed and nothing is written.
    """
    source_path = Path("tutorial_url_seeder.md")
    target_path = Path("Crawl4AI_URL_Seeder_Tutorial.ipynb")

    # Guard clause: nothing to convert without the input file.
    if not source_path.exists():
        print(f"Error: {source_path} not found!")
        return

    print(f"Reading {source_path}...")
    markdown_content = source_path.read_text(encoding='utf-8')

    print("Parsing markdown content...")
    cells = parse_markdown_to_cells(markdown_content)
    print(f"Created {len(cells)} cells")

    print("Creating Colab notebook...")
    notebook = create_colab_notebook(cells)

    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    with target_path.open('w', encoding='utf-8') as handle:
        json.dump(notebook, handle, indent=2, ensure_ascii=False)

    markdown_total = len([c for c in cells if c['cell_type'] == 'markdown'])
    code_total = len([c for c in cells if c['cell_type'] == 'code'])

    print(f"✅ Successfully created {target_path}")
    print(f" - Total cells: {len(cells)}")
    print(f" - Markdown cells: {markdown_total}")
    print(f" - Code cells: {code_total}")
|
|
|
|
|
|
# Run the conversion only when this file is executed as a script, so that
# importing the module does not read or write any files.
if __name__ == "__main__":
    main()