Rewrote sync_microsoft_skills.py (v4) to use each SKILL.md's frontmatter 'name' field as the flat directory name under skills/, replacing the nested skills/official/microsoft/<lang>/<category>/<service>/ hierarchy. This fixes CI failures caused by the indexing, validation, and catalog scripts expecting skills/<id>/SKILL.md (depth 1).

Changes:
- Rewrite scripts/sync_microsoft_skills.py for flat output with collision detection
- Update scripts/tests/inspect_microsoft_repo.py for flat name mapping
- Update scripts/tests/test_comprehensive_coverage.py for name uniqueness checks
- Delete skills/official/ nested directory
- Add 129 Microsoft skills as flat directories (e.g. skills/azure-mgmt-botservice-dotnet/)
- Move attribution files to docs/ (LICENSE-MICROSOFT, microsoft-skills-attribution.json)
- Rebuild skills_index.json, CATALOG.md, README.md (845 total skills)
220 lines
5.8 KiB
Markdown
220 lines
5.8 KiB
Markdown
---
|
|
name: azure-storage-blob-py
|
|
description: |
|
|
Azure Blob Storage SDK for Python. Use for uploading, downloading, listing blobs, managing containers, and blob lifecycle.
|
|
Triggers: "blob storage", "BlobServiceClient", "ContainerClient", "BlobClient", "upload blob", "download blob".
|
|
package: azure-storage-blob
|
|
---
|
|
|
|
# Azure Blob Storage SDK for Python
|
|
|
|
Client library for Azure Blob Storage — object storage for unstructured data.
|
|
|
|
## Installation
|
|
|
|
```bash
|
|
pip install azure-storage-blob azure-identity
|
|
```
|
|
|
|
## Environment Variables
|
|
|
|
```bash
|
|
AZURE_STORAGE_ACCOUNT_NAME=<your-storage-account>
|
|
# Or use full URL
|
|
AZURE_STORAGE_ACCOUNT_URL=https://<account>.blob.core.windows.net
|
|
```
|
|
|
|
## Authentication
|
|
|
|
```python
|
|
from azure.identity import DefaultAzureCredential
|
|
from azure.storage.blob import BlobServiceClient
|
|
|
|
credential = DefaultAzureCredential()
|
|
account_url = "https://<account>.blob.core.windows.net"
|
|
|
|
blob_service_client = BlobServiceClient(account_url, credential=credential)
|
|
```
|
|
|
|
## Client Hierarchy
|
|
|
|
| Client | Purpose | Get From |
|
|
|--------|---------|----------|
|
|
| `BlobServiceClient` | Account-level operations | Direct instantiation |
|
|
| `ContainerClient` | Container operations | `blob_service_client.get_container_client()` |
|
|
| `BlobClient` | Single blob operations | `container_client.get_blob_client()` |
|
|
|
|
## Core Workflow
|
|
|
|
### Create Container
|
|
|
|
```python
|
|
container_client = blob_service_client.get_container_client("mycontainer")
|
|
container_client.create_container()
|
|
```
|
|
|
|
### Upload Blob
|
|
|
|
```python
|
|
# From file path
|
|
blob_client = blob_service_client.get_blob_client(
|
|
container="mycontainer",
|
|
blob="sample.txt"
|
|
)
|
|
|
|
with open("./local-file.txt", "rb") as data:
|
|
blob_client.upload_blob(data, overwrite=True)
|
|
|
|
# From bytes/string
|
|
blob_client.upload_blob(b"Hello, World!", overwrite=True)
|
|
|
|
# From stream
|
|
import io
|
|
stream = io.BytesIO(b"Stream content")
|
|
blob_client.upload_blob(stream, overwrite=True)
|
|
```
|
|
|
|
### Download Blob
|
|
|
|
```python
|
|
blob_client = blob_service_client.get_blob_client(
|
|
container="mycontainer",
|
|
blob="sample.txt"
|
|
)
|
|
|
|
# To file
|
|
with open("./downloaded.txt", "wb") as file:
|
|
download_stream = blob_client.download_blob()
|
|
file.write(download_stream.readall())
|
|
|
|
# To memory
|
|
download_stream = blob_client.download_blob()
|
|
content = download_stream.readall() # bytes
|
|
|
|
# Read into an existing stream (requires "import io" for BytesIO)
|
|
stream = io.BytesIO()
|
|
num_bytes = blob_client.download_blob().readinto(stream)
|
|
```
|
|
|
|
### List Blobs
|
|
|
|
```python
|
|
container_client = blob_service_client.get_container_client("mycontainer")
|
|
|
|
# List all blobs
|
|
for blob in container_client.list_blobs():
|
|
print(f"{blob.name} - {blob.size} bytes")
|
|
|
|
# List with prefix (folder-like)
|
|
for blob in container_client.list_blobs(name_starts_with="logs/"):
|
|
print(blob.name)
|
|
|
|
# Walk blob hierarchy (virtual directories)
|
|
for item in container_client.walk_blobs(delimiter="/"):
|
|
if item.get("prefix"):
|
|
print(f"Directory: {item['prefix']}")
|
|
else:
|
|
print(f"Blob: {item.name}")
|
|
```
|
|
|
|
### Delete Blob
|
|
|
|
```python
|
|
blob_client.delete_blob()
|
|
|
|
# Delete with snapshots
|
|
blob_client.delete_blob(delete_snapshots="include")
|
|
```
|
|
|
|
## Performance Tuning
|
|
|
|
```python
|
|
# Configure chunk sizes for large uploads/downloads
|
|
blob_client = BlobClient(
|
|
account_url=account_url,
|
|
container_name="mycontainer",
|
|
blob_name="large-file.zip",
|
|
credential=credential,
|
|
max_block_size=4 * 1024 * 1024, # 4 MiB blocks
|
|
max_single_put_size=64 * 1024 * 1024 # 64 MiB single upload limit
|
|
)
|
|
|
|
# Parallel upload
|
|
blob_client.upload_blob(data, max_concurrency=4)
|
|
|
|
# Parallel download
|
|
download_stream = blob_client.download_blob(max_concurrency=4)
|
|
```
|
|
|
|
## SAS Tokens
|
|
|
|
```python
|
|
from datetime import datetime, timedelta, timezone
|
|
from azure.storage.blob import generate_blob_sas, BlobSasPermissions
|
|
|
|
sas_token = generate_blob_sas(
|
|
account_name="<account>",
|
|
container_name="mycontainer",
|
|
blob_name="sample.txt",
|
|
account_key="<account-key>", # Or use user delegation key
|
|
permission=BlobSasPermissions(read=True),
|
|
expiry=datetime.now(timezone.utc) + timedelta(hours=1)
|
|
)
|
|
|
|
# Use SAS token
|
|
blob_url = f"https://<account>.blob.core.windows.net/mycontainer/sample.txt?{sas_token}"
|
|
```
|
|
|
|
## Blob Properties and Metadata
|
|
|
|
```python
|
|
# Get properties
|
|
properties = blob_client.get_blob_properties()
|
|
print(f"Size: {properties.size}")
|
|
print(f"Content-Type: {properties.content_settings.content_type}")
|
|
print(f"Last modified: {properties.last_modified}")
|
|
|
|
# Set metadata
|
|
blob_client.set_blob_metadata(metadata={"category": "logs", "year": "2024"})
|
|
|
|
# Set content type
|
|
from azure.storage.blob import ContentSettings
|
|
blob_client.set_http_headers(
|
|
content_settings=ContentSettings(content_type="application/json")
|
|
)
|
|
```
|
|
|
|
## Async Client
|
|
|
|
```python
|
|
from azure.identity.aio import DefaultAzureCredential
|
|
from azure.storage.blob.aio import BlobServiceClient
|
|
|
|
async def upload_async():
|
|
credential = DefaultAzureCredential()
|
|
|
|
async with BlobServiceClient(account_url, credential=credential) as client:
|
|
blob_client = client.get_blob_client("mycontainer", "sample.txt")
|
|
|
|
with open("./file.txt", "rb") as data:
|
|
await blob_client.upload_blob(data, overwrite=True)
|
|
|
|
# Download async (needs its own async credential; the one in upload_async is local to that function)
|
|
async def download_async():
|
|
async with BlobServiceClient(account_url, credential=credential) as client:
|
|
blob_client = client.get_blob_client("mycontainer", "sample.txt")
|
|
|
|
stream = await blob_client.download_blob()
|
|
data = await stream.readall()
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
1. **Use DefaultAzureCredential** instead of connection strings
|
|
2. **Use context managers** (`async with`) for async clients and credentials so their connections are closed when you are done
|
|
3. **Set `overwrite=True`** explicitly when re-uploading; by default `upload_blob()` raises `ResourceExistsError` if the blob already exists
|
|
4. **Use `max_concurrency`** for large file transfers
|
|
5. **Prefer `readinto()`** over `readall()` for large blobs: streaming into a file or other writable stream avoids holding the whole blob in memory
|
|
6. **Use `walk_blobs()`** for hierarchical listing
|
|
7. **Set appropriate content types** for web-served blobs
|