Batch Processing
Learn how to efficiently process large numbers of documents with Structurify.
Overview
Structurify processes documents in batches within a project. Upload all your documents to a project, then run a single extraction job to process them all.
Workflow
- Create a project with the appropriate template
- Upload all documents to the project
- Run one extraction job (processes all documents)
- Export all results at once
Example: Process 100 Invoices
Python:
```python
import os
from pathlib import Path

from structurify import Structurify

client = Structurify(api_key=os.environ['STRUCTURIFY_API_KEY'])

# Create project
project = client.projects.create(
    name="Q1 2026 Invoices",
    template_id="tpl_invoice"
)

# Upload all invoices
invoice_dir = Path("./invoices")
for pdf in invoice_dir.glob("*.pdf"):
    doc = client.documents.upload(
        project_id=project["id"],
        file_path=str(pdf)
    )
    print(f"Uploaded: {doc['name']}")

# Run extraction (one job for all documents)
job = client.extraction.run(project_id=project["id"])

# Wait for all to complete
completed = client.extraction.wait_for_completion(
    job["id"],
    timeout=600  # 10 minutes for large batches
)
print(f"Completed: {completed['completedTasks']}")
print(f"Failed: {completed['failedTasks']}")

# Export all results
export = client.exports.create(
    project_id=project["id"],
    format="csv"
)
csv_data = client.exports.download(export["export"]["id"])
```
Node.js:
```typescript
import { Structurify } from '@structurify/sdk';
import * as fs from 'fs';
import * as path from 'path';

const client = new Structurify({
  apiKey: process.env.STRUCTURIFY_API_KEY!,
});

async function batchProcess() {
  // Create project
  const project = await client.projects.create({
    name: 'Q1 2026 Invoices',
    templateId: 'tpl_invoice',
  });

  // Upload all invoices
  const invoiceDir = './invoices';
  const files = fs.readdirSync(invoiceDir).filter(f => f.endsWith('.pdf'));
  for (const file of files) {
    const filePath = path.join(invoiceDir, file);
    const doc = await client.documents.upload({
      projectId: project.id,
      file: new Blob([fs.readFileSync(filePath)]),
      name: file,
    });
    console.log(`Uploaded: ${doc.name}`);
  }

  // Run extraction (one job for all documents)
  const job = await client.extraction.run({ projectId: project.id });

  // Wait for all to complete
  const completed = await client.extraction.waitForCompletion(job.id, {
    timeout: 600000, // 10 minutes
  });
  console.log(`Completed: ${completed.completedTasks}`);
  console.log(`Failed: ${completed.failedTasks}`);

  // Export all results
  const exportResult = await client.exports.create({
    projectId: project.id,
    format: 'csv',
  });
  return client.exports.download(exportResult.export.id);
}
```
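In the Python example, `csv_data` holds the exported results. Assuming the download call returns the raw CSV payload as bytes (the SDK's return type isn't shown in this guide), you can write it straight to disk:

```python
from pathlib import Path

# Assumption: exports.download() returns the CSV payload as bytes
Path("q1_invoices.csv").write_bytes(csv_data)
```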
Performance Tips
Parallel Uploads
Speed up uploads by running them concurrently:
```python
import asyncio

# The client from the example above is synchronous, so run each
# upload in a worker thread via asyncio.to_thread
async def upload_single(project_id: str, file_path: str):
    return await asyncio.to_thread(
        client.documents.upload,
        project_id=project_id,
        file_path=file_path
    )

async def upload_documents(project_id: str, files: list):
    tasks = []
    for file_path in files:
        task = asyncio.create_task(
            upload_single(project_id, file_path)
        )
        tasks.append(task)
    return await asyncio.gather(*tasks)
```
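To kick this off from synchronous code, wrap the call in `asyncio.run`, passing the project created in the main example:

```python
from pathlib import Path

files = [str(p) for p in Path("./invoices").glob("*.pdf")]
docs = asyncio.run(upload_documents(project["id"], files))
print(f"Uploaded {len(docs)} documents")
```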
Chunking Large Batches
For very large batches (1000+ documents), split into multiple projects:
```python
def chunk_list(lst, chunk_size):
    for i in range(0, len(lst), chunk_size):
        yield lst[i:i + chunk_size]

files = list(Path("./documents").glob("*.pdf"))
for i, chunk in enumerate(chunk_list(files, 100)):
    project = client.projects.create(
        name=f"Batch {i+1}",
        template_id="tpl_invoice"
    )
    # Upload and process chunk...
```
Credit Usage
- 1 credit = 1 document extracted
- Credits are consumed when extraction runs
- Failed extractions don't consume credits
Check your credit balance before large batches:
```python
# Check in dashboard: https://app.structurify.ai
# Or handle InsufficientCreditsError
```
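A minimal sketch of the second option, assuming the SDK exposes `InsufficientCreditsError` at the package root (the exact import path may differ; check your SDK reference):

```python
from structurify import InsufficientCreditsError  # assumed import path

try:
    job = client.extraction.run(project_id=project["id"])
except InsufficientCreditsError:
    print("Not enough credits for this batch; top up before retrying")
```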
Progress Monitoring
Monitor extraction progress:
```python
import time

job = client.extraction.run(project_id=project["id"])

while True:
    status = client.extraction.get(job["id"])
    progress = status.get("progress", 0)
    completed = status.get("completedTasks", 0)
    total = status.get("totalTasks", 0)
    print(f"Progress: {progress}% ({completed}/{total})")

    if status["status"] in ["done", "error", "cancelled"]:
        break
    time.sleep(5)
```
Handling Failures
Some documents may fail extraction. Check the job results:
```python
completed = client.extraction.wait_for_completion(job["id"])

if completed["failedTasks"] > 0:
    print(f"Warning: {completed['failedTasks']} documents failed")
    # Log failures for manual review
```
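One way to surface the failed documents is to list the project's documents and filter by status. This sketch assumes the SDK offers a `client.documents.list` method and a per-document `status` field, neither of which is shown in this guide:

```python
# Hypothetical: documents.list() and the "failed" status are assumptions
for doc in client.documents.list(project_id=project["id"]):
    if doc.get("status") == "failed":
        print(f"Failed: {doc['name']} ({doc['id']})")
```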