import streamlit as st
import pypdf
import google.generativeai as genai
import pandas as pd
import json
import io

# ==========================================
# 1. KONFIGURASI DAN SETUP API
# ==========================================
st.set_page_config(page_title="PDF to Sheet Converter", page_icon="📊", layout="wide")

# Read the Gemini API key from the secrets store. The argument to
# `st.secrets.get` is the *name* of the secret ("GEMINI_API_KEY"), never the
# key value itself; credentials must not be written into this file.
# NOTE(review): the deployment notes say the key is stored in Replit Secrets,
# which are exposed as environment variables -- confirm that `st.secrets` can
# actually see it in this environment.
GEMINI_API_KEY = st.secrets.get("GEMINI_API_KEY", "")

if GEMINI_API_KEY:
    # Configure the Gemini client once, using the value read above.
    genai.configure(api_key=GEMINI_API_KEY)
else:
    # Without a key the AI step is skipped later; tell the user how to enable it.
    st.warning("⚠️ Silakan masukkan GEMINI_API_KEY Anda di menu 'Secrets' Replit untuk mengaktifkan AI.")

# ==========================================
# 2. FUNGSI EKSTRAKSI PDF & AI
# ==========================================
def extract_text_from_pdf(uploaded_file):
    """Return the concatenated text of every page in a PDF.

    Pages for which ``extract_text()`` returns nothing are skipped; the text
    of each remaining page is followed by a newline.
    """
    reader = pypdf.PdfReader(uploaded_file)
    page_texts = (page.extract_text() for page in reader.pages)
    return "".join(f"{chunk}\n" for chunk in page_texts if chunk)

def process_text_with_ai(pdf_text, column_headers):
    """Ask Gemini to turn raw PDF text into a JSON array of records.

    Args:
        pdf_text: Text extracted from the PDF; embedded at the end of the prompt.
        column_headers: Requested keys, embedded in the prompt exactly as given.

    Returns:
        The ``text`` attribute of the model response, which is requested with
        the ``application/json`` MIME type.
    """
    gemini = genai.GenerativeModel("gemini-1.5-flash")

    # Extraction instructions: data-integrity rules, type normalisation rules,
    # the required output format, and finally the document text itself.
    instructions = f"""
    You are an expert AI Data Extraction Assistant specializing in OCR and document processing. 
    Your task is to extract the following text from a PDF document and convert it into a highly structured JSON array format ready for spreadsheet consumption.

    You must strictly adhere to the following data integrity and structural guidelines:

    1. DATA INTEGRITY (ANTI-HALLUCINATION):
    - Extract data EXACTLY as it appears in the source text. Never invent, assume, extrapolate, or guess any information.
    - If a data point is missing, incomplete, or unreadable, leave it as null or an empty string "". Do not try to complete or predict the data.

    2. DATA TYPES & STRUCTURE:
    - Ensure each column maintains a consistent data type.
    - Numerical Columns (Price, Quantity, Total, etc.): Strip away all currency symbols (e.g., $, Rp, €, USD) and non-numeric characters except for decimal points. The output must be clean numbers (Integer or Float) so they can be calculated immediately in a spreadsheet.
    - Date Columns: Standardize all date formats into the ISO standard format: YYYY-MM-DD.

    3. OUTPUT FORMAT:
    - Return ONLY a valid JSON Array of Objects. Do not include any conversational text, introductions, explanations, or markdown code block wrappers (like ```json).
    - The required schema/keys for extraction are: {column_headers}

    Here is the extracted text from the PDF:
    {pdf_text}
    """

    # Request a JSON MIME type so the reply is meant to be plain JSON.
    json_only = {"response_mime_type": "application/json"}
    reply = gemini.generate_content(instructions, generation_config=json_only)
    return reply.text

# ==========================================
# 3. TAMPILAN APLIKASI (UI)
# ==========================================
st.title("📊 Smart PDF to Sheet Converter")
st.write("Ekstrak data PDF Anda secara cerdas menjadi file Excel/CSV dengan integritas data tinggi.")

# Two-column layout: a narrow settings panel (left) and a wider results panel (right).
col1, col2 = st.columns([1, 2])

with col1:
    st.subheader("1. Pengaturan Ekstraksi")
    # Comma-separated column names the user wants extracted; the raw string is
    # passed on to the prompt unchanged.
    columns_input = st.text_input(
        "Tentukan Nama Kolom (pisahkan dengan koma):",
        value="Date, Invoice_Number, Description, Quantity, Total_Amount"
    )

    # PDF upload widget.
    uploaded_file = st.file_uploader("Upload File PDF Anda", type=["pdf"])

with col2:
    st.subheader("2. Hasil Data Spreadsheet")

    # Run the pipeline only when a file is present and the API key was configured.
    # NOTE(review): this whole branch re-executes on every script rerun, so the
    # model may be called again after widget interactions -- consider caching.
    if uploaded_file and GEMINI_API_KEY:
        with st.spinner("Membaca file PDF..."):
            pdf_text = extract_text_from_pdf(uploaded_file)

        if not pdf_text.strip():
            # No extractable text at all (e.g. an image-only PDF).
            st.error("Gagal membaca teks dari PDF. Pastikan file PDF bukan hasil foto/scan gambar buram.")
        else:
            with st.spinner("AI sedang mengekstrak dan menstrukturkan data..."):
                try:
                    # Send the text to the model.
                    ai_response = process_text_with_ai(pdf_text, columns_input)

                    # Parse the JSON string returned by the model.
                    data_json = json.loads(ai_response)

                    # The prompt asks for an array of objects, but a document
                    # with a single record may come back as one bare object.
                    # pd.DataFrame rejects a dict whose values are all scalars,
                    # so wrap that case into a one-row list.
                    if (
                        isinstance(data_json, dict)
                        and data_json
                        and not any(
                            isinstance(value, (list, dict))
                            for value in data_json.values()
                        )
                    ):
                        data_json = [data_json]

                    df = pd.DataFrame(data_json)

                    # Show the extracted table in the page.
                    st.success("🎉 Data berhasil diekstrak dengan sukses!")
                    st.dataframe(df, use_container_width=True)

                    # Download buttons for Excel and CSV.
                    st.subheader("3. Unduh File")
                    col_dl1, col_dl2 = st.columns(2)

                    # Excel export, written into an in-memory buffer.
                    buffer_excel = io.BytesIO()
                    with pd.ExcelWriter(buffer_excel, engine='openpyxl') as writer:
                        df.to_excel(writer, index=False, sheet_name='Data Ekstraksi')

                    with col_dl1:
                        st.download_button(
                            label="📥 Download format EXCEL (.xlsx)",
                            data=buffer_excel.getvalue(),
                            file_name="pdf_extracted_data.xlsx",
                            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
                        )

                    # CSV export, encoded as UTF-8 bytes.
                    csv_data = df.to_csv(index=False).encode('utf-8')
                    with col_dl2:
                        st.download_button(
                            label="📄 Download format CSV (.csv)",
                            data=csv_data,
                            file_name="pdf_extracted_data.csv",
                            mime="text/csv"
                        )

                except Exception as e:
                    # Top-level UI boundary: report any failure from the model
                    # call, JSON parsing, or export instead of crashing the page.
                    st.error(f"Terjadi kesalahan saat memproses data: {e}")
                    st.info("Tips: Coba periksa apakah nama kolom yang Anda minta sudah sesuai dengan isi PDF.")