import pandas as pd
from datetime import datetime

# Path to the Excel file
excel_path = "D:/CC_Finance_2025.xlsx"

# List all tabs to combine (e.g., Jan-Dec)
sheet_names = pd.ExcelFile(excel_path).sheet_names

# Collect normalized DataFrames
all_months = []

# Look for the header row: scan first 5 rows for "date" or "transactionid" in any cell (case-insensitive)
def is_header_row(row):
    return any("date" in str(cell).lower() or "transactionid" in str(cell).lower() for cell in row)

for sheet in sheet_names:
    # Read each tab, skip potential extra header rows (e.g., annual summary)
    # We'll read with header=0 first, then check if real data starts later
    df = pd.read_excel(excel_path, sheet_name=sheet, header=None)
    # Try to find the row with the actual column headers
    possible_header_rows = df.head(5).apply(is_header_row, axis=1)
    if possible_header_rows.any():
        header_row_idx = possible_header_rows.idxmax()
    else:
        header_row_idx = 0  # Default to first row if not found
    df.columns = df.iloc[header_row_idx]
    df = df.iloc[header_row_idx + 1:]
    df = df.reset_index(drop=True)
    
    # Remove any completely blank columns or rows
    df = df.dropna(axis=1, how='all').dropna(axis=0, how='all')
    # Standardize column names (strip whitespace, lower case)
    df.columns = [str(col).strip().lower().replace("\n", " ").replace("\r", "") for col in df.columns]
    
    # OPTIONAL: Add a column for month if needed
    df['source_month'] = sheet
    all_months.append(df)

# Combine all months into one DataFrame
full_df = pd.concat(all_months, ignore_index=True)

# Example: Normalize date columns with 2-digit years to 4-digit years
def fix_year(date_str):
    if pd.isnull(date_str):
        return date_str
    # Try to parse with 2-digit year fallback
    try:
        # If format is MM/DD/YY or DD/MM/YY, etc.
        dt = pd.to_datetime(date_str, errors='coerce', yearfirst=False)
        if pd.notnull(dt) and dt.year < 100:
            # Assume anything < 30 is 2000s, else 1900s
            if dt.year < 30:
                return dt.replace(year=dt.year + 2000)
            else:
                return dt.replace(year=dt.year + 1900)
        return dt
    except Exception:
        return date_str

if 'date' in full_df.columns:
    full_df['date'] = full_df['date'].apply(fix_year)

# Save the normalized file
full_df.to_csv("normalized_data.csv", index=False)

