Skip to main content

Parsing Documents

You can parse Brazilian fiscal documents (boletos, NFSe, and NFe), US passports, US driver's licenses, and bank statements using the abstra.ai module. Its AI-powered OCR automatically extracts structured data from PDFs and images.

Quick Start

from abstra.ai import (
    parse_bank_statement,
    parse_boleto,
    parse_nfe,
    parse_nfse,
    parse_us_driver_license,
    parse_us_passport,
)
from abstra.forms import FileInput, run

# Basic document parsing: show one upload widget and read the value stored
# under its key from the form result.
upload_page = [FileInput(label="Upload Document", key="document")]
document_file = run([upload_page])["document"]

# Parse based on document type. Swap parse_boleto for parse_nfse, parse_nfe,
# parse_us_passport, parse_us_driver_license, or parse_bank_statement as needed.
parsed_data = parse_boleto(document_file.path)

US Passport Parsing

Extract personal information and document details from US passports:

from abstra.ai import parse_us_passport
from abstra.forms import FileInput, TextOutput, run
from datetime import datetime

def passport_verification_app():
    """Collect a US passport upload, parse it, and show the extracted fields.

    Shows an upload form, parses the uploaded file with ``parse_us_passport``,
    and renders the parsed fields together with an expiry warning when one
    applies.

    Returns:
        The value returned by ``parse_us_passport``, or ``None`` when parsing
        or rendering raised; in that case the error is shown to the user.
    """
    # Upload form
    upload_form = [
        FileInput(
            label="Upload US Passport (PDF, JPG, or PNG)",
            key="passport_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]

    file_data = run([upload_form])
    passport_file = file_data["passport_file"]

    try:
        # Parse the passport
        passport = parse_us_passport(passport_file.path)

        # Compare calendar dates rather than timestamps: a midnight datetime
        # compared against "now" would flag the passport as expired on its
        # own expiration day and under-count the remaining days.
        # NOTE(review): assumes expiration_date is a "YYYY-MM-DD" string — confirm.
        exp_date = datetime.strptime(passport.expiration_date, "%Y-%m-%d").date()
        days_until_expiry = (exp_date - datetime.now().date()).days
        is_expired = days_until_expiry < 0

        # Display results
        results = [
            TextOutput(f"**Full Name:** {passport.given_names} {passport.family_name}"),
            TextOutput(f"**Passport Number:** {passport.document_id}"),
            TextOutput(f"**Date of Birth:** {passport.date_of_birth}"),
            TextOutput(f"**Issue Date:** {passport.issue_date}"),
            TextOutput(f"**Expiration Date:** {passport.expiration_date}"),
        ]

        # Add expiry warnings
        if is_expired:
            results.append(TextOutput("🚨 **This passport has expired!**"))
        elif days_until_expiry <= 180:  # 6 months
            results.append(TextOutput(f"⚠️ **Passport expires in {days_until_expiry} days!**"))

        # Show MRZ if available
        if passport.mrz_code:
            results.append(TextOutput(f"**MRZ Code:** `{passport.mrz_code}`"))

        run([results])
        return passport

    except Exception as e:
        # Top-level boundary of the example: surface any failure to the user
        # instead of crashing the form.
        run([TextOutput(f"❌ **Error parsing passport:** {str(e)}")])
        return None

# Run the app; the result is the parsed passport, or None if processing failed
passport_data = passport_verification_app()

Boleto Parsing

Process Brazilian payment slips and extract banking information:

from abstra.ai import parse_boleto
from abstra.forms import FileInput, TextOutput, run
from datetime import datetime

def boleto_processor():
    """Collect a boleto upload, parse it, and show the payment information.

    Shows an upload form, parses the uploaded file with ``parse_boleto``, and
    renders the payment details with an overdue warning when one applies.

    Returns:
        The value returned by ``parse_boleto``, or ``None`` when parsing or
        rendering raised; in that case the error is shown to the user.
    """
    # Upload boleto
    upload_form = [
        FileInput(
            label="Upload Boleto Document",
            key="boleto_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]

    file_data = run([upload_form])
    boleto_file = file_data["boleto_file"]

    try:
        # Parse the boleto
        boleto = parse_boleto(boleto_file.path)

        # Convert amount from centavos to reais
        amount_reais = boleto.valor / 100

        # Compare calendar dates rather than timestamps: a midnight datetime
        # compared against "now" would report a boleto due today as
        # "0 days overdue".
        # NOTE(review): assumes vencimento is a "YYYY-MM-DD" string — confirm.
        due_date = datetime.strptime(boleto.vencimento, "%Y-%m-%d").date()
        days_overdue = (datetime.now().date() - due_date).days
        is_overdue = days_overdue > 0

        # Display payment information
        results = [
            TextOutput(f"**Beneficiary:** {boleto.beneficiario}"),
            TextOutput(f"**Amount:** R$ {amount_reais:.2f}"),
            TextOutput(f"**Due Date:** {boleto.vencimento}"),
            TextOutput(f"**Barcode:** `{boleto.codigo_de_barras}`"),
        ]

        # Add overdue warning
        if is_overdue:
            results.append(TextOutput(f"🚨 **This boleto is {days_overdue} days overdue!**"))

        # Additional details if available
        if boleto.nosso_numero:
            results.append(TextOutput(f"**Bank Reference:** {boleto.nosso_numero}"))
        if boleto.numero_documento:
            results.append(TextOutput(f"**Document Number:** {boleto.numero_documento}"))
        if boleto.cpf_cnpj_beneficiario:
            results.append(TextOutput(f"**Beneficiary Tax ID:** {boleto.cpf_cnpj_beneficiario}"))

        run([results])
        return boleto

    except Exception as e:
        # Top-level boundary of the example: surface any failure to the user
        # instead of crashing the form.
        run([TextOutput(f"❌ **Error parsing boleto:** {str(e)}")])
        return None

# Process boleto; the result is the parsed boleto, or None if processing failed
boleto_data = boleto_processor()

NFSe Parsing

Extract service invoice information from Brazilian NFSe documents:

from abstra.ai import parse_nfse
from abstra.forms import FileInput, TextOutput, run

def nfse_analyzer():
    """Upload an NFSe, parse it, and render the service invoice details.

    Returns:
        The object produced by ``parse_nfse``. If anything raises after the
        upload, the error is displayed and ``None`` is returned instead.
    """
    # Single form page asking for the NFSe file
    nfse_page = [
        FileInput(
            label="Upload NFSe Document",
            key="nfse_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]
    nfse_file = run([nfse_page])["nfse_file"]

    try:
        nfse = parse_nfse(nfse_file.path)

        # Amounts are stored in centavos; a missing or zero amount is shown as 0.00
        net_amount = (nfse.valor_liquido_centavos or 0) / 100
        total_amount = (nfse.valor_total_centavos or 0) / 100

        # Invoice header, always shown
        lines = [
            TextOutput("## 📋 Service Invoice Details"),
            TextOutput(f"**Invoice Number:** {nfse.numero_nota}"),
            TextOutput(f"**Issue Date:** {nfse.data_emissao}"),
            TextOutput(f"**Net Amount:** R$ {net_amount:.2f}"),
            TextOutput(f"**Total Amount:** R$ {total_amount:.2f}"),
        ]

        # Provider section, shown only when the provider's company name was extracted
        if nfse.razao_social_prestador:
            lines += [
                TextOutput("## 🏢 Service Provider"),
                TextOutput(f"**Company:** {nfse.razao_social_prestador}"),
                TextOutput(f"**CNPJ:** {nfse.cnpj_prestador}"),
            ]
            if nfse.endereco_prestador:
                lines.append(TextOutput(f"**Address:** {nfse.endereco_prestador}"))
            if nfse.email_prestador:
                lines.append(TextOutput(f"**Email:** {nfse.email_prestador}"))

        # Recipient section, shown only when the recipient's company name was extracted
        if nfse.razao_social_tomador:
            lines += [
                TextOutput("## 👤 Service Recipient"),
                TextOutput(f"**Company:** {nfse.razao_social_tomador}"),
                TextOutput(f"**CNPJ:** {nfse.cnpj_tomador}"),
            ]
            if nfse.endereco_tomador:
                lines.append(TextOutput(f"**Address:** {nfse.endereco_tomador}"))

        # Free-text description of the service
        if nfse.descricao:
            lines += [
                TextOutput("## 📝 Service Description"),
                TextOutput(nfse.descricao),
            ]

        run([lines])
        return nfse

    except Exception as e:
        # Surface any failure to the user instead of crashing the form
        run([TextOutput(f"❌ **Error parsing NFSe:** {str(e)}")])
        return None

# Process NFSe; the result is the parsed invoice, or None if processing failed
nfse_data = nfse_analyzer()

NFe Parsing

Process comprehensive Brazilian electronic invoices:

from abstra.ai import parse_nfe
from abstra.forms import FileInput, TextOutput, run

def _nfe_field(label, value, *, money=False, missing="N/A"):
    """Build one labelled output line, keeping the label when the value is missing.

    Args:
        label: Text shown in bold before the value.
        value: The parsed value; may be ``None`` or empty.
        money: When true, format ``value`` as ``R$ 0.00`` and treat only
            ``None`` as missing, so a zero amount is still displayed.
        missing: Placeholder shown in place of a missing value.

    Returns:
        A ``TextOutput`` of the form ``**<label>:** <value or placeholder>``.
    """
    if money:
        text = missing if value is None else f"R$ {value:.2f}"
    else:
        text = f"{value}" if value else missing
    return TextOutput(f"**{label}:** {text}")


def nfe_processor():
    """Collect an NFe upload, parse it, and show the invoice details.

    Shows an upload form, parses the uploaded file with ``parse_nfe``, and
    renders one section per group of fields that was extracted. Optional
    fields that are missing are shown as a labelled placeholder so the user
    can tell which field is absent.

    Returns:
        The value returned by ``parse_nfe``, or ``None`` when parsing or
        rendering raised; in that case the error is shown to the user.
    """
    # Upload NFe
    upload_form = [
        FileInput(
            label="Upload NFe Document",
            key="nfe_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]

    file_data = run([upload_form])
    nfe_file = file_data["nfe_file"]

    try:
        # Parse the NFe
        nfe = parse_nfe(nfe_file.path)

        # Display basic invoice information
        results = [
            TextOutput("## 📄 Electronic Invoice (NFe)"),
            TextOutput(f"**Invoice Number:** {nfe.numero_nota}"),
            TextOutput(f"**Series:** {nfe.serie}"),
            TextOutput(f"**Access Key:** `{nfe.chave_acesso}`"),
            TextOutput(f"**Issue Date:** {nfe.data_emissao}"),
        ]

        # Financial information. Test against None (like the tax fields
        # below) so that a zero total is still displayed.
        if nfe.valor_total is not None:
            results.extend([
                TextOutput("## 💰 Financial Information"),
                _nfe_field("Products Value", nfe.valor_produtos, money=True),
                TextOutput(f"**Total Value:** R$ {nfe.valor_total:.2f}"),
            ])

        # Tax information
        if nfe.valor_icms is not None:
            results.append(TextOutput(f"**ICMS Tax:** R$ {nfe.valor_icms:.2f}"))
        if nfe.valor_ipi is not None:
            results.append(TextOutput(f"**IPI Tax:** R$ {nfe.valor_ipi:.2f}"))

        # Issuer information
        if nfe.razao_social_emitente:
            results.extend([
                TextOutput("## 🏭 Issuer"),
                TextOutput(f"**Company:** {nfe.razao_social_emitente}"),
                TextOutput(f"**CNPJ:** {nfe.cnpj_emitente}"),
                _nfe_field("Address", nfe.endereco_emitente, missing="Not available"),
            ])

        # Recipient information
        if nfe.nome_destinatario:
            results.extend([
                TextOutput("## 📦 Recipient"),
                TextOutput(f"**Name:** {nfe.nome_destinatario}"),
                TextOutput(f"**Tax ID:** {nfe.cpf_cnpj_destinatario}"),
                _nfe_field("Address", nfe.endereco_destinatario, missing="Not available"),
            ])

        # Product information
        if nfe.descricao_produto:
            results.extend([
                TextOutput("## 📋 Product Details"),
                TextOutput(f"**Description:** {nfe.descricao_produto}"),
                _nfe_field("Product Code", nfe.codigo_produto),
                _nfe_field("NCM/SH", nfe.ncm_sh),
                _nfe_field("Unit Value", nfe.valor_unitario, money=True),
            ])

        # Transportation info
        if nfe.razao_social_transportadora:
            results.extend([
                TextOutput("## 🚛 Transportation"),
                TextOutput(f"**Carrier:** {nfe.razao_social_transportadora}"),
                TextOutput(f"**CNPJ:** {nfe.cnpj_transportadora}"),
                _nfe_field("Vehicle Plate", nfe.placa_veiculo),
            ])

        # Additional information
        if nfe.informacoes_adicionais:
            results.extend([
                TextOutput("## ℹ️ Additional Information"),
                TextOutput(nfe.informacoes_adicionais),
            ])

        run([results])
        return nfe

    except Exception as e:
        # Top-level boundary of the example: surface any failure to the user
        # instead of crashing the form.
        run([TextOutput(f"❌ **Error parsing NFe:** {str(e)}")])
        return None

# Process NFe; the result is the parsed invoice, or None if processing failed
nfe_data = nfe_processor()

US Driver License Parsing

Extract personal information and licensing details from US driver's licenses:

from abstra.ai import parse_us_driver_license
from abstra.forms import FileInput, TextOutput, MarkdownOutput, run
from datetime import datetime

def driver_license_processor():
    """Collect a US driver license upload, parse it, and show the extracted data.

    Shows an upload form, parses the uploaded file with
    ``parse_us_driver_license``, and renders the personal and license details
    followed by an expiration status. Progress is logged with ``print``.

    Returns:
        The value returned by ``parse_us_driver_license``, or ``None`` when
        parsing raised; in that case the error is logged and shown to the user.
    """
    # Upload driver license
    upload_form = [
        FileInput(
            label="Upload US Driver License (PDF, JPG, PNG)",
            key="license_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]

    print("🪪 Starting driver license processing...")
    file_data = run([upload_form])
    license_file = file_data["license_file"]
    print(f"📁 File uploaded: {license_file.name}")

    # Parse the driver license using AI. A failure is reported and turned
    # into a None result so the caller can distinguish success from failure.
    print("🔍 Parsing driver license with AI...")
    try:
        license_data = parse_us_driver_license(license_file.path)
    except Exception as e:
        print(f"❌ Error parsing driver license: {e}")
        run([TextOutput(f"❌ **Error parsing driver license:** {str(e)}")])
        return None
    print("✅ Driver license parsed successfully!")

    # Display license information
    results = [
        MarkdownOutput("## 🪪 US Driver License Information"),
        MarkdownOutput("---"),
        MarkdownOutput("### 👤 Personal Information"),
        TextOutput(f"**Given Names:** {license_data.given_names or 'Not specified'}"),
        TextOutput(f"**Family Name:** {license_data.family_name or 'Not specified'}"),
        TextOutput(f"**Date of Birth:** {license_data.date_of_birth or 'Not specified'}"),
        TextOutput(f"**Address:** {license_data.address or 'Not specified'}"),
        MarkdownOutput("### 📄 License Details"),
        TextOutput(f"**Document ID:** {license_data.document_id or 'Not specified'}"),
        TextOutput(f"**Issue Date:** {license_data.issue_date or 'Not specified'}"),
        TextOutput(f"**Expiration Date:** {license_data.expiration_date or 'Not specified'}"),
    ]

    # Check if license is expired (if expiration date is available)
    try:
        exp_date = license_data.expiration_date
        if exp_date:
            # Normalise to a calendar date so the comparison below is valid
            # whether the parser yields a date, a datetime, or a string.
            # NOTE(review): string form is assumed to be "YYYY-MM-DD" — confirm.
            if isinstance(exp_date, str):
                exp_date = datetime.strptime(exp_date, "%Y-%m-%d").date()
            elif isinstance(exp_date, datetime):
                exp_date = exp_date.date()

            days_until_expiry = (exp_date - datetime.now().date()).days
            is_expired = days_until_expiry < 0

            results.append(MarkdownOutput("### ⏰ Expiration Status"))

            if is_expired:
                results.append(TextOutput("🚨 **This driver license has EXPIRED!**"))
                results.append(TextOutput(f"**Expired:** {abs(days_until_expiry)} days ago"))
            elif days_until_expiry <= 60:  # 2 months
                results.append(TextOutput(f"⚠️ **License expires in {days_until_expiry} days!**"))
                results.append(TextOutput("Consider renewing soon."))
            else:
                results.append(TextOutput(f"✅ **License is valid** (expires in {days_until_expiry} days)"))
    except (ValueError, AttributeError, TypeError) as e:
        # TypeError covers an expiration value that cannot be compared with a date.
        print(f"⚠️ Could not parse expiration date: {e}")
        results.append(MarkdownOutput("### ⏰ Expiration Status"))
        results.append(TextOutput("⚠️ Unable to determine expiration status"))

    # Log the extracted information for debugging
    # NOTE(review): these lines write personal data to the logs — confirm
    # that is acceptable before using this outside of a demo.
    print("📊 Extracted data:")
    print(f" - Name: {license_data.given_names} {license_data.family_name}")
    print(f" - DOB: {license_data.date_of_birth}")
    print(f" - Document ID: {license_data.document_id}")
    print(f" - Expiration: {license_data.expiration_date}")

    run([results])
    return license_data

# Process driver license and report the outcome in the logs
print("🚀 Starting driver license processor...")
license_info = driver_license_processor()
# A falsy result is reported as a failed run
status_message = (
    "🏁 Processing completed successfully!" if license_info else "❌ Processing failed."
)
print(status_message)

Bank Statement Parsing

Extract transaction details and account information from bank statements:

from abstra.ai import parse_bank_statement
from abstra.forms import FileInput, TextOutput, MarkdownOutput, run

def _balance_to_float(raw):
    """Convert a balance such as ``"$1,234.56"`` (or a plain number) to a float.

    Raises:
        ValueError: If the cleaned text is not a valid number.
    """
    return float(str(raw).replace('$', '').replace(',', '').strip())


def bank_statement_processor():
    """Collect a bank statement upload, parse it, and show a summary.

    Shows an upload form, parses the uploaded file with
    ``parse_bank_statement``, and renders bank, account, period, and balance
    information plus the net balance change when it can be computed.
    Progress is logged with ``print``.

    Returns:
        The value returned by ``parse_bank_statement``, or ``None`` when
        parsing raised; in that case the error is logged and shown to the user.
    """
    # Upload bank statement
    upload_form = [
        FileInput(
            label="Upload Bank Statement (PDF, JPG, PNG)",
            key="statement_file",
            accepted_formats=['.pdf', '.jpg', '.jpeg', '.png'],
        )
    ]

    print("📄 Starting bank statement processing...")
    file_data = run([upload_form])
    statement_file = file_data["statement_file"]
    print(f"📁 File uploaded: {statement_file.name}")

    # Parse the bank statement using AI. A failure is reported and turned
    # into a None result so the caller can distinguish success from failure.
    print("🔍 Parsing bank statement with AI...")
    try:
        statement = parse_bank_statement(statement_file.path)
    except Exception as e:
        print(f"❌ Error parsing bank statement: {e}")
        run([TextOutput(f"❌ **Error parsing bank statement:** {str(e)}")])
        return None
    print("✅ Bank statement parsed successfully!")

    # Display parsed bank statement information
    results = [
        MarkdownOutput("## 🏦 Bank Statement Summary"),
        MarkdownOutput("---"),
        MarkdownOutput("### 🏦 Bank Information"),
        TextOutput(f"**Bank Name:** {statement.bank_name or 'Not specified'}"),
        TextOutput(f"**Bank Address:** {statement.bank_address or 'Not specified'}"),
        MarkdownOutput("### 👤 Account Information"),
        TextOutput(f"**Client Name:** {statement.client_name or 'Not specified'}"),
        TextOutput(f"**Client Address:** {statement.client_address or 'Not specified'}"),
        TextOutput(f"**Account Number:** {statement.account_number or 'Not specified'}"),
        TextOutput(f"**Account Type:** {statement.account_type or 'Not specified'}"),
        MarkdownOutput("### 📅 Statement Period"),
        TextOutput(f"**Start Date:** {statement.statement_start_date or 'Not specified'}"),
        TextOutput(f"**End Date:** {statement.statement_end_date or 'Not specified'}"),
        MarkdownOutput("### 💰 Balance Information"),
        TextOutput(f"**Starting Balance:** {statement.starting_balance or 'Not specified'}"),
        TextOutput(f"**Ending Balance:** {statement.ending_balance or 'Not specified'}"),
    ]

    # Calculate balance change if both balances are available and numeric
    try:
        start_raw = statement.starting_balance
        end_raw = statement.ending_balance
        # Only None and the empty string count as missing, so a numeric
        # zero balance still takes part in the calculation.
        if start_raw not in (None, "") and end_raw not in (None, ""):
            balance_change = _balance_to_float(end_raw) - _balance_to_float(start_raw)

            if balance_change > 0:
                results.append(TextOutput(f"**Net Change:** +${balance_change:.2f} ✅"))
            elif balance_change < 0:
                results.append(TextOutput(f"**Net Change:** -${abs(balance_change):.2f} ⚠️"))
            else:
                results.append(TextOutput("**Net Change:** $0.00 ➡️"))
    except (ValueError, TypeError) as e:
        print(f"⚠️ Could not calculate balance change: {e}")
        results.append(TextOutput("**Net Change:** Unable to calculate"))

    # Log the extracted information for debugging
    # NOTE(review): these lines write client and account data to the logs —
    # confirm that is acceptable before using this outside of a demo.
    print("📊 Extracted data:")
    print(f" - Client: {statement.client_name}")
    print(f" - Account: {statement.account_number}")
    print(f" - Period: {statement.statement_start_date} to {statement.statement_end_date}")
    # Separate the two balances; without a separator they run together.
    print(f" - Balance: {statement.starting_balance} → {statement.ending_balance}")

    run([results])
    return statement

# Process bank statement and report the outcome in the logs
print("🚀 Starting bank statement processor...")
statement_data = bank_statement_processor()
# A falsy result is reported as a failed run
status_message = (
    "🏁 Processing completed successfully!" if statement_data else "❌ Processing failed."
)
print(status_message)

Tips for Best Results

  • Image Quality: Use high-resolution scans (300+ DPI)
  • File Formats: PDF files typically provide the best results
  • Lighting: Ensure even lighting without shadows or glare
  • Orientation: Keep documents properly oriented and flat
  • File Size: Keep files under 10MB for optimal processing speed
  • Complete Documents: Include all pages and sections of the document