import { getDocument } from 'pdfjs-dist';
import 'pdfjs-dist/build/pdf.worker.mjs'; // Ensure worker is bundled
import { TextItem } from 'pdfjs-dist/types/src/display/api';
import { Transaction } from '../Transaction';
import { clean, cleanAmount, extractValue } from './helpUtils';

/**
 * Statement layouts that can be recognised from a table header row.
 * `Unknown` is the result for a row that matches neither header layout.
 */
enum MercuryDocType {
  Type1 = 0,
  Type2 = 1,
  Unknown = 2,
}

/**
 * Column indices used to pull transaction fields out of a row that has
 * already been split into columns. Which set of indices applies depends on
 * the detected document type.
 */
interface ExtractionMapping {
  /** Index of the column holding the transaction date. */
  date: number;
  /** Indices of the columns handed to `extractValue` to build the main description. */
  desc1: number[];
  /** Index of the column whose cleaned text is appended to the description after ' | ' when non-empty. */
  additionalDesc: number;
  /** Index of the column passed to `cleanAmount` to obtain the amount. */
  amount: number;
}

/**
 * Column-index layouts, keyed by the name of the document type they apply to.
 * Every entry has the same shape: a date column, the description columns,
 * an additional-description column and an amount column.
 */
const extractionMappingConfig = {
  // Layout for headers whose first token reads "Date(UTC)".
  Type1: { date: 0, desc1: [2], additionalDesc: 6, amount: 7 },

  // Layout for headers where "Date", "(", "UTC" and ")" arrive as separate
  // tokens; the extra header tokens push every column index further right.
  Type2: { date: 0, desc1: [6, 7, 8], additionalDesc: 12, amount: 13 },

  // Fallback entry; carries the same indices as Type1.
  Unknown: { date: 0, desc1: [2], additionalDesc: 6, amount: 7 },
};

/**
 * Classifies a row of text tokens as one of the known table-header layouts.
 *
 * - `Type1`: the first token, with whitespace removed, is exactly "Date(UTC)".
 * - `Type2`: the row has more than four tokens and tokens 0, 2, 3 and 4
 *   (whitespace removed) are "Date", "(", "UTC" and ")". Token 1 is not checked.
 * - `Unknown`: anything else, including an empty row.
 *
 * @param row Tokens that make up a single row.
 * @returns The detected document type.
 */
const determineDocType = (row: TextItem[]): MercuryDocType => {
  // Token text at `index` with every whitespace character stripped.
  const compact = (index: number): string => row[index].str.replace(/\s+/g, '');

  if (row.length === 0) {
    return MercuryDocType.Unknown;
  }
  if (compact(0) === 'Date(UTC)') {
    return MercuryDocType.Type1;
  }

  // The length guard comes first so indices 2..4 are only read when they exist.
  const isSplitHeader =
    row.length > 4 &&
    compact(0) === 'Date' &&
    compact(2) === '(' &&
    compact(3) === 'UTC' &&
    compact(4) === ')';

  return isSplitHeader ? MercuryDocType.Type2 : MercuryDocType.Unknown;
};

/**
 * Splits a flat token stream into rows, closing a row at every token flagged
 * with `hasEOL`.
 *
 * A token stream is not guaranteed to end on an EOL-flagged token, so any
 * tokens left over after the last EOL are emitted as a final row instead of
 * being discarded.
 */
const groupTokensIntoRows = (tokens: TextItem[]): TextItem[][] => {
  const rows: TextItem[][] = [];
  let currentRow: TextItem[] = [];
  for (const token of tokens) {
    currentRow.push(token);
    if (token.hasEOL) {
      rows.push(currentRow);
      currentRow = [];
    }
  }
  // Flush the unterminated trailing row, if any.
  if (currentRow.length > 0) {
    rows.push(currentRow);
  }
  return rows;
};

/**
 * Parses the text tokens of one page into transactions.
 *
 * Steps:
 * 1. Group tokens into rows (see `groupTokensIntoRows`).
 * 2. Find the first row recognised by `determineDocType`; it is the table
 *    header and fixes the document type. Without a header the page yields `[]`.
 * 3. Drop every row before the first one whose leading token starts with a
 *    three-letter English month abbreviation; all rows from there on are
 *    treated as transaction rows.
 * 4. Derive column boundaries as the midpoints between the sorted x positions
 *    (`transform[4]`) of the header tokens, and bucket each row's tokens into
 *    those columns.
 * 5. Read date, description and amount from the columns named by the
 *    extraction mapping for the document type.
 *
 * @param tokens Text items of a single page, in the order they were extracted.
 * @returns One transaction per transaction row. `date` is the cleaned date
 *   column, or the previous row's date when that column is empty. `amount` is
 *   the absolute value of the parsed amount and `direction` is 'CREDIT' for a
 *   positive parsed amount, 'DEBIT' otherwise.
 */
const parseTransactionsFromRows = (tokens: TextItem[]): Transaction[] => {
  const rowTokens = groupTokensIntoRows(tokens);

  // Identify header row and document type.
  let headerRow: TextItem[] | null = null;
  let docType: MercuryDocType = MercuryDocType.Unknown;
  for (const row of rowTokens) {
    docType = determineDocType(row);
    if (docType !== MercuryDocType.Unknown) {
      headerRow = row;
      break;
    }
  }
  if (!headerRow) return [];

  // Transaction rows start at the first row that begins with a month abbreviation.
  const monthRegex = /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b/;
  const firstTransactionIndex = rowTokens.findIndex((row) => monthRegex.test(row[0].str.trim()));
  const transactionRows =
    firstTransactionIndex === -1 ? [] : rowTokens.slice(firstTransactionIndex);

  // Pick the column mapping that belongs to the detected document type.
  const mappingKey =
    docType === MercuryDocType.Type1
      ? 'Type1'
      : docType === MercuryDocType.Type2
      ? 'Type2'
      : 'Unknown';
  const mapping: ExtractionMapping = extractionMappingConfig[mappingKey];

  // Column boundaries are the midpoints between neighbouring header tokens.
  // The trailing Infinity guarantees every token lands in some column.
  const headerX = headerRow.map((token) => token.transform[4]).sort((a, b) => a - b);
  const boundaries: number[] = [];
  for (let i = 0; i < headerX.length - 1; i++) {
    boundaries.push((headerX[i] + headerX[i + 1]) / 2);
  }
  boundaries.push(Infinity);
  const numCols = headerX.length;

  const transactions: Transaction[] = [];
  let lastDate = '';
  for (const row of transactionRows) {
    // Sort left-to-right so text inside a column is concatenated in reading order.
    const sorted = row.slice().sort((a, b) => a.transform[4] - b.transform[4]);
    const columns: string[] = new Array(numCols).fill('');
    for (const token of sorted) {
      const x = token.transform[4];
      let colIndex = 0;
      while (colIndex < boundaries.length && x > boundaries[colIndex]) {
        colIndex++;
      }
      columns[colIndex] += token.str + ' ';
    }

    // A row with an empty date column inherits the date of the row above it.
    const dateVal = clean(columns[mapping.date]) || lastDate;
    const amountVal = cleanAmount(columns[mapping.amount]);
    const additionalDesc = clean(columns[mapping.additionalDesc]);
    const description =
      extractValue(columns, mapping.desc1) + (additionalDesc ? ' | ' + additionalDesc : '');

    transactions.push({
      date: dateVal,
      description,
      // Store the magnitude only; the sign is carried by `direction`
      // (per the original note, this is what Plaid expects).
      amount: Math.abs(amountVal),
      direction: amountVal > 0 ? 'CREDIT' : 'DEBIT',
    } as Transaction);
    lastDate = dateVal;
  }
  return transactions;
};

/**
 * Returns copies of the given transactions with ", <year>" appended to each
 * `date`. The input array and its elements are left untouched.
 *
 * @param transactions Transactions whose `date` does not yet carry a year.
 * @param year Year to append to every date.
 */
const transform = (transactions: Transaction[], year: number) => {
  const yearSuffix = ', ' + year;
  return transactions.map((entry) => ({ ...entry, date: entry.date + yearSuffix }));
};

/**
 * Reads a `File` fully into memory with `FileReader`.
 * Only the callback-based reader is wrapped in a Promise; a read error
 * rejects with whatever the reader's `error` event delivers.
 */
const readFileAsArrayBuffer = (file: File): Promise<ArrayBuffer> =>
  new Promise<ArrayBuffer>((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result as ArrayBuffer);
    reader.onerror = reject;
    reader.readAsArrayBuffer(file);
  });

/**
 * Extracts the transactions of every page of one PDF file, in page order.
 * Any failure while loading or reading the PDF rejects the returned promise.
 */
const extractTransactionsFromPdf = async (file: File): Promise<Transaction[]> => {
  const data = new Uint8Array(await readFileAsArrayBuffer(file));
  const pdf = await getDocument({ data }).promise;
  try {
    const allPageTransactions: Transaction[] = [];
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      // Each page is parsed on its own, so every page must carry its own header row.
      allPageTransactions.push(...parseTransactionsFromRows(textContent.items as TextItem[]));
    }
    return allPageTransactions;
  } finally {
    // Release the document and its worker resources once the text has been read.
    await pdf.destroy();
  }
};

/**
 * Parses the given statement PDFs into transactions.
 *
 * Files are processed one after another, so the result keeps file order and,
 * within a file, page order. The year is appended to every transaction date
 * at the end via `transform`.
 *
 * @param files PDF files to parse.
 * @param year Year appended to each transaction date.
 * @returns All transactions of all files.
 * @throws Error when a file yields no transactions. Errors raised while
 *   reading or parsing a PDF are propagated as rejections rather than
 *   leaving the promise pending.
 */
const processMercuryFile = async (files: File[], year: number): Promise<Transaction[]> => {
  const allTransactions: Transaction[] = [];
  for (const file of files) {
    const transactions = await extractTransactionsFromPdf(file);
    if (!transactions || transactions.length === 0) {
      throw new Error('No transactions found in the file ' + file.name);
    }
    allTransactions.push(...transactions);
  }

  return transform(allTransactions, year);
};

export default processMercuryFile;
