import io
import json
import os

import google.generativeai as genai
import pandas as pd
import pypdf
import streamlit as st

# ==========================================
# 1. CONFIGURATION AND API SETUP
# ==========================================
st.set_page_config(page_title="PDF to Sheet Converter", page_icon="📊", layout="wide")


def _load_api_key(name: str = "GEMINI_API_KEY") -> str:
    """Return the API key stored under *name*, or "" when it is not configured.

    The key is looked up in Streamlit secrets first and then in the process
    environment (the setup note for this app says the key is stored in Replit
    "Secrets", which are described there as environment variables).
    The key must NEVER be written into this file.
    """
    try:
        key = st.secrets.get(name, "")
    except Exception:
        # NOTE(review): st.secrets raises when no secrets file exists; the
        # exact exception class differs between Streamlit versions, so this
        # is deliberately broad — narrow it once the target version is pinned.
        key = ""
    return key or os.environ.get(name, "")


# Add GEMINI_API_KEY in the Replit "Secrets" menu (or .streamlit/secrets.toml).
GEMINI_API_KEY = _load_api_key()

if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)
else:
    st.warning("⚠️ Silakan masukkan GEMINI_API_KEY Anda di menu 'Secrets' Replit untuk mengaktifkan AI.")


# ==========================================
# 2. PDF EXTRACTION & AI FUNCTIONS
# ==========================================
def extract_text_from_pdf(uploaded_file) -> str:
    """Read a PDF and return the text of all its pages.

    Args:
        uploaded_file: A path or binary file-like object accepted by
            ``pypdf.PdfReader`` (here, the Streamlit upload).

    Returns:
        The text of every page that yielded text, each followed by a
        newline. Pages with no extractable text are skipped, so the result
        is "" when nothing could be extracted.
    """
    pdf_reader = pypdf.PdfReader(uploaded_file)
    page_texts = (page.extract_text() for page in pdf_reader.pages)
    return "".join(text + "\n" for text in page_texts if text)


def process_text_with_ai(pdf_text: str, column_headers: str) -> str:
    """Ask Gemini to turn extracted PDF text into a JSON array of rows.

    Args:
        pdf_text: Raw text extracted from the PDF.
        column_headers: The requested column names, interpolated verbatim
            into the prompt as the extraction schema.

    Returns:
        The model's response text. A JSON MIME type is requested, but the
        caller is still responsible for parsing and validating it.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Extraction prompt: data-integrity rules, type rules, output format.
    prompt = f"""
    You are an expert AI Data Extraction Assistant specializing in OCR and document processing.
    Your task is to extract the following text from a PDF document and convert it into a highly structured JSON array format ready for spreadsheet consumption.

    You must strictly adhere to the following data integrity and structural guidelines:

    1. DATA INTEGRITY (ANTI-HALLUCINATION):
       - Extract data EXACTLY as it appears in the source text. Never invent, assume, extrapolate, or guess any information.
       - If a data point is missing, incomplete, or unreadable, leave it as null or an empty string "". Do not try to complete or predict the data.

    2. DATA TYPES & STRUCTURE:
       - Ensure each column maintains a consistent data type.
       - Numerical Columns (Price, Quantity, Total, etc.): Strip away all currency symbols (e.g., $, Rp, €, USD) and non-numeric characters except for decimal points. The output must be clean numbers (Integer or Float) so they can be calculated immediately in a spreadsheet.
       - Date Columns: Standardize all date formats into the ISO standard format: YYYY-MM-DD.

    3. OUTPUT FORMAT:
       - Return ONLY a valid JSON Array of Objects. Do not include any conversational text, introductions, explanations, or markdown code block wrappers (like ```json).
       - The required schema/keys for extraction are: {column_headers}

    Here is the extracted text from the PDF:
    {pdf_text}
    """

    # Request a JSON response so the caller can hand it to json.loads.
    response = model.generate_content(
        prompt,
        generation_config={"response_mime_type": "application/json"},
    )
    return response.text


# ==========================================
# 3. APPLICATION UI
# ==========================================
st.title("📊 Smart PDF to Sheet Converter")
st.write("Ekstrak data PDF Anda secara cerdas menjadi file Excel/CSV dengan integritas data tinggi.")

# Two-column layout: settings on the left, results on the right.
col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Pengaturan Ekstraksi")

    # Column names the user wants in the output, as a comma-separated string.
    columns_input = st.text_input(
        "Tentukan Nama Kolom (pisahkan dengan koma):",
        value="Date, Invoice_Number, Description, Quantity, Total_Amount",
    )

    # PDF upload.
    uploaded_file = st.file_uploader("Upload File PDF Anda", type=["pdf"])

with col2:
    st.subheader("2. Hasil Data Spreadsheet")

    # Nothing is processed until a file is uploaded and an API key is set.
    if uploaded_file and GEMINI_API_KEY:
        with st.spinner("Membaca file PDF..."):
            pdf_text = extract_text_from_pdf(uploaded_file)

        if not pdf_text.strip():
            st.error("Gagal membaca teks dari PDF. Pastikan file PDF bukan hasil foto/scan gambar buram.")
        else:
            with st.spinner("AI sedang mengekstrak dan menstrukturkan data..."):
                try:
                    ai_response = process_text_with_ai(pdf_text, columns_input)

                    # Parse the model output into rows for a DataFrame.
                    data_json = json.loads(ai_response)
                    if isinstance(data_json, dict):
                        # A single object instead of an array would make
                        # pd.DataFrame fail on all-scalar values; treat it
                        # as a one-row table.
                        data_json = [data_json]
                    df = pd.DataFrame(data_json)

                    # Show the table in the page.
                    st.success("🎉 Data berhasil diekstrak dengan sukses!")
                    st.dataframe(df, use_container_width=True)

                    # Download buttons (Excel / CSV).
                    st.subheader("3. Unduh File")
                    col_dl1, col_dl2 = st.columns(2)

                    # Excel export, built in memory.
                    buffer_excel = io.BytesIO()
                    with pd.ExcelWriter(buffer_excel, engine='openpyxl') as writer:
                        df.to_excel(writer, index=False, sheet_name='Data Ekstraksi')

                    with col_dl1:
                        st.download_button(
                            label="📥 Download format EXCEL (.xlsx)",
                            data=buffer_excel.getvalue(),
                            file_name="pdf_extracted_data.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                        )

                    # CSV export.
                    csv_data = df.to_csv(index=False).encode('utf-8')
                    with col_dl2:
                        st.download_button(
                            label="📄 Download format CSV (.csv)",
                            data=csv_data,
                            file_name="pdf_extracted_data.csv",
                            mime="text/csv",
                        )

                except Exception as e:
                    # Top-level UI boundary: report any API, parsing or
                    # export failure to the user instead of crashing the page.
                    st.error(f"Terjadi kesalahan saat memproses data: {e}")
                    st.info("Tips: Coba periksa apakah nama kolom yang Anda minta sudah sesuai dengan isi PDF.")