fix: Simplify PDF filling - use fillable fields directly
The previous approach was overcomplicated with LaTeX templates. Now the system simply:

1. Detects PDF form fields (AcroForm)
2. Sends field names to Gemini for data extraction
3. Gemini returns 'key' matching exact PDF field names
4. Fields are filled directly in the original PDF

Removed:
- LaTeX template detection logic
- G2210-specific field definitions
- Complex mode switching

The fillable PDF approach is simpler and more reliable.

https://claude.ai/code/session_016pQhdznHZ74Fpkvwr3cLBq
This commit is contained in:
parent
19e96ef59b
commit
f7f899dce7
2 changed files with 104 additions and 367 deletions
|
|
@ -2,7 +2,6 @@ import { GoogleGenAI, Type, Schema } from "@google/genai";
|
|||
import { FileData, FormResponse } from "../types";
|
||||
import { PdfFieldInfo } from "./pdfService";
|
||||
import { getApiKey } from "./apiKeyService";
|
||||
import { detectTemplate, getExpectedFields } from "./latexService";
|
||||
|
||||
const getAI = () => {
|
||||
const apiKey = getApiKey();
|
||||
|
|
@ -76,98 +75,12 @@ const responseSchema: Schema = {
|
|||
required: ["fields", "summary"]
|
||||
};
|
||||
|
||||
// G2210-11 specific field definitions for better extraction
|
||||
const G2210_FIELDS = `
|
||||
REQUIRED FIELDS FOR G2210-11 (Ärztlicher Befundbericht):
|
||||
Extract ALL of the following fields from the source document:
|
||||
|
||||
PATIENT DATA:
|
||||
- Versicherungsnummer (e.g., "12 345678 A 123")
|
||||
- ABT.-Nr. (Aktenzeichen/Abteilungsnummer)
|
||||
- Name, Vorname (Full name: "Nachname, Vorname")
|
||||
- Geburtsdatum (format: DD.MM.YYYY)
|
||||
- Geschlecht (männlich/weiblich/divers)
|
||||
- Straße, Hausnummer
|
||||
- PLZ
|
||||
- Ort
|
||||
- Telefon
|
||||
- Krankenkasse
|
||||
|
||||
EMPLOYMENT:
|
||||
- Derzeitige Tätigkeit (Beruf)
|
||||
- Arbeitgeber
|
||||
- Arbeitsunfähig seit (date: DD.MM.YYYY)
|
||||
- Letzte Arbeitsaufnahme
|
||||
|
||||
DIAGNOSES (up to 6, with ICD-10 codes):
|
||||
- Diagnose 1 + Diagnose 1 ICD
|
||||
- Diagnose 2 + Diagnose 2 ICD
|
||||
- Diagnose 3 + Diagnose 3 ICD
|
||||
- Diagnose 4 + Diagnose 4 ICD
|
||||
- Diagnose 5 + Diagnose 5 ICD
|
||||
- Diagnose 6 + Diagnose 6 ICD
|
||||
|
||||
ANAMNESIS:
|
||||
- Anamnese/Beschwerden (patient symptoms and history)
|
||||
- Krankheitsverlauf (disease progression, previous treatments)
|
||||
- Körperlicher Befund (physical examination findings)
|
||||
|
||||
FUNCTIONAL LIMITATIONS (mark as "keine", "gering", or "erheblich"):
|
||||
- Mobilität keine/gering/erheblich
|
||||
- Selbstversorgung keine/gering/erheblich
|
||||
- Haushaltsführung keine/gering/erheblich
|
||||
- Erwerbstätigkeit keine/gering/erheblich
|
||||
- Kommunikation keine/gering/erheblich
|
||||
- Psychische Belastbarkeit keine/gering/erheblich
|
||||
- Beeinträchtigungen Erläuterung
|
||||
|
||||
MEDICATION (up to 5):
|
||||
- Medikament 1 + Medikament 1 Dosis + Medikament 1 Seit
|
||||
- Medikament 2 + Medikament 2 Dosis + Medikament 2 Seit
|
||||
- Medikament 3 + Medikament 3 Dosis + Medikament 3 Seit
|
||||
- Medikament 4 + Medikament 4 Dosis + Medikament 4 Seit
|
||||
- Medikament 5 + Medikament 5 Dosis + Medikament 5 Seit
|
||||
- Physikalische Therapie
|
||||
|
||||
PREVIOUS REHABILITATION:
|
||||
- Reha 1 Zeitraum + Reha 1 Einrichtung + Reha 1 Erfolg
|
||||
- Reha 2 Zeitraum + Reha 2 Einrichtung + Reha 2 Erfolg
|
||||
|
||||
ASSESSMENT:
|
||||
- Leistungsvermögen (vollschichtig/3-6 Stunden/unter 3 Stunden)
|
||||
- Rehabilitationsbedürftigkeit (reasoning for rehab need)
|
||||
- Rehabilitationsziel
|
||||
- Rehabilitationsform (stationär/ambulant/ganztägig ambulant)
|
||||
- Reha Einrichtung Empfehlung
|
||||
|
||||
TRAVEL:
|
||||
- Reisefähig (ja/nein)
|
||||
- Reisefähig Begründung (if no)
|
||||
- Begleitperson (ja/nein)
|
||||
|
||||
ADDITIONAL:
|
||||
- Ergänzende Angaben
|
||||
|
||||
DOCTOR INFORMATION:
|
||||
- Arzt Name
|
||||
- Facharztbezeichnung
|
||||
- Praxis Anschrift
|
||||
- Praxis Telefon
|
||||
- BSNR
|
||||
- LANR
|
||||
- Unterschrift Datum
|
||||
`;
|
||||
|
||||
export const processDocuments = async (
|
||||
blankForm: FileData,
|
||||
sourceDocument: FileData,
|
||||
pdfFields: PdfFieldInfo[] = []
|
||||
): Promise<FormResponse> => {
|
||||
|
||||
// Detect if we have a known template
|
||||
const detectedTemplate = detectTemplate(blankForm.file?.name || '');
|
||||
const expectedFields = detectedTemplate ? getExpectedFields(detectedTemplate) : [];
|
||||
|
||||
const formPart = {
|
||||
inlineData: {
|
||||
data: blankForm.base64,
|
||||
|
|
@ -183,73 +96,53 @@ export const processDocuments = async (
|
|||
};
|
||||
|
||||
let systemPrompt = `
|
||||
ROLE: Intelligent Document Processing AI (Verification Expert).
|
||||
TASK: Extract data from the SOURCE DOCUMENT and map it to the BLANK TARGET FORM.
|
||||
ROLE: Intelligent Document Processing AI.
|
||||
TASK: Extract data from the SOURCE DOCUMENT and fill the BLANK TARGET FORM.
|
||||
|
||||
CRITICAL INSTRUCTION: You must verify every extraction. If a value is ambiguous, plausibility is low, or you are guessing, set validation.status to 'WARNING' and explain why in validation.message.
|
||||
CRITICAL: You must verify every extraction. If uncertain, set validation.status to 'WARNING'.
|
||||
`;
|
||||
|
||||
// Add template-specific instructions
|
||||
if (detectedTemplate === 'G2210-11') {
|
||||
systemPrompt += `
|
||||
DETECTED FORM: G2210-11 (Ärztlicher Befundbericht der DRV Westfalen)
|
||||
|
||||
${G2210_FIELDS}
|
||||
|
||||
IMPORTANT INSTRUCTIONS:
|
||||
1. Extract ALL fields listed above, even if they are empty in the source.
|
||||
2. Use the EXACT label names as listed above for each field.
|
||||
3. For multi-value fields like diagnoses and medications, create separate field entries.
|
||||
4. For checkbox fields (Mobilität, Selbstversorgung, etc.), return separate fields for each option.
|
||||
Example: If mobility is "erheblich", return:
|
||||
- "Mobilität keine" with value ""
|
||||
- "Mobilität gering" with value ""
|
||||
- "Mobilität erheblich" with value "true"
|
||||
5. ICD-10 codes must be in standard format (e.g., "M54.5", "F32.1")
|
||||
6. Dates must be in DD.MM.YYYY format.
|
||||
7. For Leistungsvermögen, return separate checkbox fields:
|
||||
- "Leistungsvermögen vollschichtig" (true/false)
|
||||
- "Leistungsvermögen 3-6 Stunden" (true/false)
|
||||
- "Leistungsvermögen unter 3 Stunden" (true/false)
|
||||
`;
|
||||
} else if (pdfFields.length > 0) {
|
||||
const fieldList = pdfFields.map(f => `"${f.name}" (${f.type})`).join(", ");
|
||||
// PRIORITY 1: If PDF has fillable fields, USE THEM - this is the simplest and best approach
|
||||
if (pdfFields.length > 0) {
|
||||
const fieldList = pdfFields.map(f => `"${f.name}" (${f.type})`).join("\n- ");
|
||||
systemPrompt += `
|
||||
MODE: FILLABLE PDF (AcroForm).
|
||||
The target form has specific embedded fields.
|
||||
Map extracted data to these exact field IDs: [${fieldList}].
|
||||
Return the 'key' property matching the field ID.
|
||||
`;
|
||||
} else if (expectedFields.length > 0) {
|
||||
systemPrompt += `
|
||||
MODE: TEMPLATE-BASED EXTRACTION.
|
||||
Extract the following specific fields: [${expectedFields.join(", ")}].
|
||||
Use these exact label names in your response.
|
||||
|
||||
The target PDF has these EXACT fillable fields:
|
||||
- ${fieldList}
|
||||
|
||||
CRITICAL INSTRUCTIONS:
|
||||
1. For EACH field listed above, extract the corresponding value from the SOURCE DOCUMENT.
|
||||
2. Return the 'key' property with the EXACT field name from the list above.
|
||||
3. The 'label' should be a human-readable description.
|
||||
4. For checkboxes: use value "true" to check, "false" to uncheck.
|
||||
5. For text fields: use the extracted text value.
|
||||
|
||||
You MUST return a field entry for each PDF field listed above.
|
||||
The 'key' MUST match exactly one of the field names I provided.
|
||||
`;
|
||||
} else {
|
||||
// FALLBACK: Visual overlay mode for non-fillable PDFs
|
||||
systemPrompt += `
|
||||
MODE: VISUAL FILLING (Flat/XFA/Scan).
|
||||
The target form DOES NOT have accessible digital fields.
|
||||
You must VISUALLY locate where the text should be written.
|
||||
MODE: VISUAL FILLING (Flat PDF/Scan).
|
||||
The target form does NOT have digital form fields.
|
||||
|
||||
For every field you identify on the TARGET FORM:
|
||||
1. Extract the corresponding value from the SOURCE DOCUMENT.
|
||||
2. Estimate the VISUAL COORDINATES [pageIndex, x, y] where the text should start.
|
||||
- 'x' and 'y' are on a scale of 0 to 1000.
|
||||
- (0,0) is the top-left corner of the page.
|
||||
2. Estimate VISUAL COORDINATES [pageIndex, x, y] where the text should be written.
|
||||
- x and y are on a scale of 0 to 1000.
|
||||
- (0,0) is the top-left corner.
|
||||
- (1000,1000) is the bottom-right corner.
|
||||
- Align text slightly above lines or inside boxes.
|
||||
|
||||
For checkboxes: If true/yes, the value should be "X" placed inside the box.
|
||||
For checkboxes: value should be "X" if checked.
|
||||
`;
|
||||
}
|
||||
|
||||
systemPrompt += `
|
||||
VALIDATION RULES:
|
||||
1. Dates: Ensure format matches the form (e.g. DD.MM.YYYY).
|
||||
2. Checkboxes: Only mark if explicitly supported by source.
|
||||
3. Missing Data: If a field is not found in source, leave 'value' empty and set status 'VALID'. Do not hallucinate.
|
||||
4. Source Context: Always populate 'sourceContext' with the exact text snippet from the source document that justifies your extraction.
|
||||
1. Dates: German format DD.MM.YYYY
|
||||
2. Missing Data: Leave 'value' empty, don't hallucinate.
|
||||
3. Source Context: Include the exact text snippet from source that justifies the extraction.
|
||||
`;
|
||||
|
||||
try {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue