Rewrote sync_microsoft_skills.py (v4) to use each SKILL.md's frontmatter 'name' field as the flat directory name under skills/, replacing the nested skills/official/microsoft/<lang>/<category>/<service>/ hierarchy. This fixes CI failures caused by the indexing, validation, and catalog scripts expecting skills/<id>/SKILL.md (depth 1). Changes: - Rewrite scripts/sync_microsoft_skills.py for flat output with collision detection - Update scripts/tests/inspect_microsoft_repo.py for flat name mapping - Update scripts/tests/test_comprehensive_coverage.py for name uniqueness checks - Delete skills/official/ nested directory - Add 129 Microsoft skills as flat directories (e.g. skills/azure-mgmt-botservice-dotnet/) - Move attribution files to docs/ (LICENSE-MICROSOFT, microsoft-skills-attribution.json) - Rebuild skills_index.json, CATALOG.md, README.md (845 total skills)
250 lines
6.7 KiB
Markdown
250 lines
6.7 KiB
Markdown
---
|
|
name: azure-ai-translation-document-py
|
|
description: |
|
|
Azure AI Document Translation SDK for batch translation of documents with format preservation. Use for translating Word, PDF, Excel, PowerPoint, and other document formats at scale.
|
|
Triggers: "document translation", "batch translation", "translate documents", "DocumentTranslationClient".
|
|
package: azure-ai-translation-document
|
|
---
|
|
|
|
# Azure AI Document Translation SDK for Python
|
|
|
|
Client library for the Azure AI Translator Document Translation service, which translates documents in batch while preserving their original format.
|
|
|
|
## Installation
|
|
|
|
```bash
|
|
pip install azure-ai-translation-document
|
|
```
|
|
|
|
## Environment Variables
|
|
|
|
```bash
|
|
AZURE_DOCUMENT_TRANSLATION_ENDPOINT=https://<resource>.cognitiveservices.azure.com
|
|
AZURE_DOCUMENT_TRANSLATION_KEY=<your-api-key> # If using API key
|
|
|
|
# Storage for source and target documents
|
|
AZURE_SOURCE_CONTAINER_URL=https://<storage>.blob.core.windows.net/<container>?<sas>
|
|
AZURE_TARGET_CONTAINER_URL=https://<storage>.blob.core.windows.net/<container>?<sas>
|
|
```
|
|
|
|
## Authentication
|
|
|
|
### API Key
|
|
|
|
```python
|
|
import os
|
|
from azure.ai.translation.document import DocumentTranslationClient
|
|
from azure.core.credentials import AzureKeyCredential
|
|
|
|
endpoint = os.environ["AZURE_DOCUMENT_TRANSLATION_ENDPOINT"]
|
|
key = os.environ["AZURE_DOCUMENT_TRANSLATION_KEY"]
|
|
|
|
client = DocumentTranslationClient(endpoint, AzureKeyCredential(key))
|
|
```
|
|
|
|
### Entra ID (Recommended)
|
|
|
|
```python
|
|
import os
from azure.ai.translation.document import DocumentTranslationClient
|
|
from azure.identity import DefaultAzureCredential
|
|
|
|
client = DocumentTranslationClient(
|
|
endpoint=os.environ["AZURE_DOCUMENT_TRANSLATION_ENDPOINT"],
|
|
credential=DefaultAzureCredential()
|
|
)
|
|
```
|
|
|
|
## Basic Document Translation
|
|
|
|
```python
|
|
from azure.ai.translation.document import DocumentTranslationInput, TranslationTarget
|
|
|
|
source_url = os.environ["AZURE_SOURCE_CONTAINER_URL"]
|
|
target_url = os.environ["AZURE_TARGET_CONTAINER_URL"]
|
|
|
|
# Start translation job
|
|
poller = client.begin_translation(
|
|
inputs=[
|
|
DocumentTranslationInput(
|
|
source_url=source_url,
|
|
targets=[
|
|
TranslationTarget(
|
|
target_url=target_url,
|
|
language="es" # Translate to Spanish
|
|
)
|
|
]
|
|
)
|
|
]
|
|
)
|
|
|
|
# Wait for completion
|
|
result = poller.result()
|
|
|
|
print(f"Status: {poller.status()}")
|
|
print(f"Documents translated: {poller.details.documents_succeeded_count}")
|
|
print(f"Documents failed: {poller.details.documents_failed_count}")
|
|
```
|
|
|
|
## Multiple Target Languages
|
|
|
|
```python
|
|
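# target_url_es, target_url_fr, and target_url_de are SAS URLs of separate target containers, one per language
poller = client.begin_translation(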
|
|
inputs=[
|
|
DocumentTranslationInput(
|
|
source_url=source_url,
|
|
targets=[
|
|
TranslationTarget(target_url=target_url_es, language="es"),
|
|
TranslationTarget(target_url=target_url_fr, language="fr"),
|
|
TranslationTarget(target_url=target_url_de, language="de")
|
|
]
|
|
)
|
|
]
|
|
)
|
|
```
|
|
|
|
## Translate Single Document
|
|
|
|
```python
|
|
from azure.ai.translation.document import SingleDocumentTranslationClient
|
|
|
|
single_client = SingleDocumentTranslationClient(endpoint, AzureKeyCredential(key))
|
|
|
|
with open("document.docx", "rb") as f:
|
|
document_content = f.read()
|
|
|
|
result = single_client.translate(
|
|
body=document_content,
|
|
target_language="es",
|
|
content_type="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
|
)
|
|
|
|
# Save translated document
|
|
with open("document_es.docx", "wb") as f:
|
|
f.write(result)
|
|
```
|
|
|
|
## Check Translation Status
|
|
|
|
```python
|
|
# Get all translation operations
|
|
operations = client.list_translation_statuses()
|
|
|
|
for op in operations:
|
|
print(f"Operation ID: {op.id}")
|
|
print(f"Status: {op.status}")
|
|
print(f"Created: {op.created_on}")
|
|
print(f"Total documents: {op.documents_total_count}")
|
|
print(f"Succeeded: {op.documents_succeeded_count}")
|
|
print(f"Failed: {op.documents_failed_count}")
|
|
```
|
|
|
|
## List Document Statuses
|
|
|
|
```python
|
|
# Get status of individual documents in a job
|
|
operation_id = poller.id
|
|
document_statuses = client.list_document_statuses(operation_id)
|
|
|
|
for doc in document_statuses:
|
|
print(f"Document: {doc.source_document_url}")
|
|
print(f" Status: {doc.status}")
|
|
print(f" Translated to: {doc.translated_to}")
|
|
if doc.error:
|
|
print(f" Error: {doc.error.message}")
|
|
```
|
|
|
|
## Cancel Translation
|
|
|
|
```python
|
|
# Cancel a running translation
|
|
client.cancel_translation(operation_id)
|
|
```
|
|
|
|
## Using Glossary
|
|
|
|
```python
|
|
from azure.ai.translation.document import TranslationGlossary
|
|
|
|
poller = client.begin_translation(
|
|
inputs=[
|
|
DocumentTranslationInput(
|
|
source_url=source_url,
|
|
targets=[
|
|
TranslationTarget(
|
|
target_url=target_url,
|
|
language="es",
|
|
glossaries=[
|
|
TranslationGlossary(
|
|
glossary_url="https://<storage>.blob.core.windows.net/glossary/terms.csv?<sas>",
|
|
file_format="csv"
|
|
)
|
|
]
|
|
)
|
|
]
|
|
)
|
|
]
|
|
)
|
|
```
|
|
|
|
## Supported Document Formats
|
|
|
|
```python
|
|
# Get supported formats
|
|
formats = client.get_supported_document_formats()
|
|
|
|
for fmt in formats:
|
|
print(f"Format: {fmt.format}")
|
|
print(f" Extensions: {fmt.file_extensions}")
|
|
print(f" Content types: {fmt.content_types}")
|
|
```
|
|
|
|
## Supported Languages
|
|
|
|
```python
|
|
# Get supported languages
|
|
languages = client.get_supported_languages()
|
|
|
|
for lang in languages:
|
|
print(f"Language: {lang.name} ({lang.code})")
|
|
```
|
|
|
|
## Async Client
|
|
|
|
```python
|
|
from azure.ai.translation.document.aio import DocumentTranslationClient
|
|
from azure.identity.aio import DefaultAzureCredential
|
|
|
|
async def translate_documents():
|
|
async with DocumentTranslationClient(
|
|
endpoint=endpoint,
|
|
credential=DefaultAzureCredential()
|
|
) as client:
|
|
poller = await client.begin_translation(inputs=[...])
|
|
result = await poller.result()
|
|
```
|
|
|
|
## Supported Formats
|
|
|
|
| Category | Formats |
|
|
|----------|---------|
|
|
| Documents | DOCX, PDF, PPTX, XLSX, HTML, TXT, RTF |
|
|
| Structured | CSV, TSV, JSON, XML |
|
|
| Localization | XLIFF, XLF, MHTML |
|
|
|
|
## Storage Requirements
|
|
|
|
- Source and target containers must be Azure Blob Storage
|
|
- Use SAS tokens with appropriate permissions:
|
|
  - Source: Read, List
|
|
  - Target: Write, List
|
|
|
|
## Best Practices
|
|
|
|
1. **Use SAS tokens** with minimal required permissions
|
|
2. **Monitor long-running operations** with `poller.status()`
|
|
3. **Handle document-level errors** by iterating document statuses
|
|
4. **Use glossaries** for domain-specific terminology
|
|
5. **Use a separate target container** for each language
|
|
6. **Use async client** for multiple concurrent jobs
|
|
7. **Check supported formats** before submitting documents
|