Is there a reasonable opportunity to replace the grid in a python program with a LO component? I use Adobe XFA Form fields and I want to automate the filling process using a spreadsheet. The only way I can currently fill documents is in Acrobat Reader and I use the fields directly but they can be converted to normal form fields as shown in the following example:
Code to add fields to pdf:
from PyPDFForm import PdfWrapper
from PyPDFForm import FormWrapper
new_form = PdfWrapper("Capital-gains-tax-schedule-2022.pdf").create_widget(
"text", "TFN_1", 1, 142, 605, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "TFN_2", 1, 198, 605, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "TFN_3", 1, 255, 605, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "ABN_1", 1, 204, 553, width=27, height=17, max_length=2, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "ABN_2", 1, 247, 553, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "ABN_3", 1, 303, 553, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "ABN_4", 1, 360, 553, width=41, height=17, max_length=3, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Taxpayer", 1, 28, 514, width=537, height=17, max_length=38, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Taxpayer2", 1, 28, 490, width=537, height=17, max_length=38, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1A", 1, 153, 420, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1K", 1, 380, 420, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1B", 1, 153, 390, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1L", 1, 380, 390, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1C", 1, 153, 360, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1M", 1, 380, 360, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1D", 1, 153, 330, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1N", 1, 380, 330, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1E", 1, 153, 300, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1O", 1, 380, 300, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1F", 1, 153, 271, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1P", 1, 380, 271, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1G", 1, 153, 241, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1H", 1, 153, 211, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1Q", 1, 380, 211, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2I", 1, 153, 181, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1R", 1, 380, 181, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1S", 1, 153, 140, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "1J", 1, 153, 92, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2A", 2, 380, 760, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2B", 2, 380, 714, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2C", 2, 380, 685, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2D", 2, 380, 655, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "2E", 2, 380, 615, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "3A", 2, 380, 544, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "3B", 2, 380, 520, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "4A", 2, 380, 429, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "5A", 2, 380, 371, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "5B", 2, 380, 347, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "5C", 2, 380, 323, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "5D", 2, 380, 295, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "6A", 2, 380, 232, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"radio", "7A", 3, [312,446,554], [759,759,759], size=16, border_width=0).create_widget(
"text", "7B", 3, 351, 693, width=27, height=17, max_length=2, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7C", 3, 351, 669, width=27, height=17, max_length=2, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7D", 3, 360, 639, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7E", 3, 360, 609, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7EL", 3, 554, 609, width=13, height=17, max_length=1, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7F", 3, 351, 540, width=55, height=17, max_length=4, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7G", 3, 360, 510, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "7GL", 3, 554, 510, width=13, height=17, max_length=1, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8A", 3, 362, 456, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8AL", 3, 554, 456, width=13, height=17, max_length=1, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8B", 3, 362, 432, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8C", 3, 362, 408, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8D", 3, 362, 384, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "8E", 3, 362, 360, width=167, height=17, max_length=11, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Day", 4, 401, 553, width=27, height=17, max_length=2, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Month", 4, 444, 553, width=27, height=17, max_length=2, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Year", 4, 486, 553, width=55, height=17, max_length=4, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Contact_Name", 4, 28, 519, width=537, height=17, max_length=38, comb=True, border_width=0, bg_color=(0, 0, 0, 0)).create_widget(
"text", "Contact_Number", 4, 28, 485, width=211, height=17, max_length=15, comb=True, border_width=0, bg_color=(0, 0, 0, 0)
)
with open("sample.pdf", "wb+") as output:
output.write(new_form.read())
"""
filled = FormWrapper("sample.pdf").fill(
{
"Date": "22052025",
},
)
with open("output.pdf", "wb+") as output:
output.write(filled.read())
I’m not aware of any FOSS that can extract the form fields so I’ve developed a process to extract the box coordinates and display then on the pdf with a grid so fields can be identified and attributes added before generating the form fields creating a pdf updatable in any pdf reader.
Coding the form is a tedious error-prone manual effort. DataMap points are currently filtered before points.csv is written to load into v[isual]pdf.py. A right-mouse-click on the pdf brings up the [proof of concept] grid and selected record is highlighted. I understand all boxes are a similar size. Optionally enter the field length and number of boxes or select a template for predefined field characteristics like grouping and finessed spacing.
Would it be reasonable to replace the grid with a stripped down calc sheet or similar to filter and sort the data, and enter predefined template names, and other required data? If so, what is an outline of the process?
Useful Grid features:
- load and save
- improve atrocious data entry
- context menu to toggle the row/column coordinate sort
- enter Template value from a dropdown list
- enter an
>
to retain currently selected record allowing another record to be selected for field width calculation - enter either number of boxes or field length to calculate the other
- launch process to generate updated pdf form with fields
This is running under Windows 11 and all current files and code are include in this and the next post.
Extract DataMap from pdf:
datamap.py
import os
import sys
import csv
import pikepdf
from openpyxl import Workbook
from openpyxl.styles import Font
from openpyxl.utils import get_column_letter
class Transform:
"""
Handles coordinate transformation from PDF native coordinates to Acrobat-style coordinates.
"""
def __init__(self, matrix):
"""
Initialize the transformation matrix.
Args:
matrix (list): A list of six floats representing the transformation matrix.
"""
self.a, self.b, self.c, self.d, self.e, self.f = matrix
def to_acrobat(self, x, y):
"""
Convert (x, y) coordinates using the transformation matrix to Acrobat coordinates.
Args:
x (float): The x-coordinate in PDF space.
y (float): The y-coordinate in PDF space.
Returns:
tuple: Transformed (x, y) coordinates in Acrobat space.
"""
x_new = x - self.e
y_new = - (y + self.f)
return (x_new, y_new)
class PDFDatamapExtractor:
"""
Extracts mapping data from a PDF file's InDesign PieceInfo dictionary.
"""
def __init__(self, pdf_path, target_key):
"""
Initialize the extractor with a PDF file path and the target dictionary key.
Args:
pdf_path (str): Path to the PDF file.
target_key (str): The key to extract from the InDesign dictionary.
"""
self.pdf_path = pdf_path
self.target_key = target_key
def extract_pageitemuid_maps(self):
"""
Extracts mapping data from all pages in the PDF.
Returns:
list: A list of rows, each representing extracted attributes for a page item.
"""
rows = []
with pikepdf.open(self.pdf_path) as pdf:
all_pages = list(pdf.pages)
for i, page in enumerate(all_pages):
piece_info = page.get('/PieceInfo', None)
if piece_info and '/InDesign' in piece_info:
indesign = piece_info['/InDesign']
if self.target_key in indesign:
for k, v in indesign[self.target_key].items():
item = int(k.lstrip('/'))
attributes = [float(x) for x in v]
rows.append([i + 1, item] + attributes)
return rows
class DatamapProcessor:
"""
Processes extracted PDF data: sorts, transforms coordinates, and adds computed columns.
"""
def __init__(self, transform):
"""
Initialize the processor with a coordinate transformation object.
Args:
transform (Transform): An instance of the Transform class.
"""
self.transform = transform
def process_rows(self, rows):
"""
Sorts rows and computes Acrobat coordinates and dimensions for each row.
Args:
rows (list): List of extracted raw rows.
Returns:
list: Processed rows with additional coordinate and size columns.
"""
all_rows_sorted = sorted(rows, key=lambda row: (row[0], row[1]))
final_rows = []
for row in all_rows_sorted:
base_row = row + [''] * (15 - len(row)) if len(row) < 15 else row[:15]
try:
x1 = float(base_row[5])
y1 = float(base_row[6])
x2 = float(base_row[7])
y2 = float(base_row[8])
pdf_left, pdf_top = self.transform.to_acrobat(x1, y1)
pdf_right, pdf_bottom = self.transform.to_acrobat(x2, y2)
pdf_left = round(pdf_left, 3)
pdf_top = round(pdf_top, 3)
pdf_right = round(pdf_right, 3)
pdf_bottom = round(pdf_bottom, 3)
height = round(pdf_top - pdf_bottom, 3)
width = round(pdf_right - pdf_left, 3)
pdf_coords = [pdf_left, pdf_top, pdf_right, pdf_bottom, height, width]
except (ValueError, IndexError, TypeError):
pdf_coords = ['', '', '', '', '', '']
final_rows.append(base_row + pdf_coords)
return final_rows
def sort_final_rows(self, final_rows):
"""
Sorts processed rows by page, descending PdfBottom, and PdfLeft.
Args:
final_rows (list): List of processed rows.
Returns:
list: Sorted rows.
"""
def sort_key(row):
try:
page = int(row[0])
except (ValueError, TypeError):
page = 0
try:
pdf_bottom = float(row[18])
except (ValueError, IndexError, TypeError):
pdf_bottom = float('-inf')
try:
pdf_left = float(row[15])
except (ValueError, IndexError, TypeError):
pdf_left = float('inf')
return (page, -pdf_bottom, pdf_left)
return sorted(final_rows, key=sort_key)
def add_gap_column(self, sorted_rows):
"""
Adds a column representing the gap between adjacent items on the same page.
Args:
sorted_rows (list): List of sorted rows.
Returns:
list: Rows with an additional gap column.
"""
rows_with_gap = []
for idx, row in enumerate(sorted_rows):
if idx + 1 < len(sorted_rows):
curr_page = row[0]
next_row = sorted_rows[idx + 1]
next_page = next_row[0]
try:
curr_pdf_right = float(row[17])
next_pdf_left = float(next_row[15])
except (ValueError, IndexError, TypeError):
gap = ''
else:
if curr_page == next_page:
gap = round(next_pdf_left - curr_pdf_right, 3)
else:
gap = ''
else:
gap = ''
rows_with_gap.append(row + [gap])
return rows_with_gap
class SpreadsheetWriter:
"""
Writes processed data rows to an Excel spreadsheet with headers and formatting.
"""
def __init__(self, header, extra_columns):
"""
Initialize the writer with header and extra column names.
Args:
header (list): List of main column headers.
extra_columns (list): List of additional columns to append.
"""
self.header = header
self.extra_columns = extra_columns
def write(self, rows, output_path):
"""
Write the data rows to an Excel file, appending extra columns and formatting headers.
Args:
rows (list): List of rows to write.
output_path (str): Output Excel file path.
"""
wb = Workbook()
ws = wb.active
ws.append(self.header)
for row in rows:
ws.append(row)
last_col = ws.max_column
for idx, col_name in enumerate(self.extra_columns, start=last_col + 1):
ws.cell(row=1, column=idx, value=col_name)
for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=last_col + 1, max_col=last_col + len(self.extra_columns)):
for cell in row:
cell.value = ''
for cell in ws[1]:
cell.font = Font(bold=True)
ws.freeze_panes = 'C2'
max_col_letter = get_column_letter(ws.max_column)
max_row = ws.max_row
ws.auto_filter.ref = f"A1:{max_col_letter}{max_row}"
wb.save(output_path)
class PDFDataMapController:
"""
Orchestrates the extraction, processing, and writing of PDF data map information.
Also exports selected columns to CSV.
"""
def __init__(self, pdf_path):
"""
Initialize the controller with the PDF path and configuration.
Args:
pdf_path (str): Path to the input PDF file.
"""
self.pdf_path = pdf_path
self.header = [
'Page', 'Rec', 'f3', 'ID', 'Type', 'x1', 'y1', 'x2', 'y2',
'r', 'g', 'b', 'a', 'f14', 'f15',
'xLeft', 'yTop', 'xRight', 'yBottom',
'Height', 'Width', 'Gap'
]
self.extra_columns = [
'Template', 'FName', 'FBWidth', 'Length', 'widget_type', 'name',
'page_number', 'x', 'y', 'width', 'height', 'max_length', 'comb', 'etc'
]
self.csv_columns = [
'Page', 'Rec', 'ID', 'Type', 'xLeft', 'yTop', 'xRight', 'yBottom',
'Height', 'Width','Gap', 'Template', 'FName', 'FBWidth', 'Length'
]
self.matrix = [1.0, 0.0, 0.0, 1.0, -297.638, -420.945]
self.transform = Transform(self.matrix)
self.target_key = '/PageItemUIDToLocationDataMap'
def run(self):
"""
Run the full data extraction, processing, Excel writing, and CSV export workflow.
"""
extractor = PDFDatamapExtractor(self.pdf_path, self.target_key)
raw_rows = extractor.extract_pageitemuid_maps()
processor = DatamapProcessor(self.transform)
processed_rows = processor.process_rows(raw_rows)
sorted_rows = processor.sort_final_rows(processed_rows)
rows_with_gap = processor.add_gap_column(sorted_rows)
base, _ = os.path.splitext(self.pdf_path)
spreadsheet_name = f"{base}_DataMap.xlsx"
writer = SpreadsheetWriter(self.header, self.extra_columns)
writer.write(rows_with_gap, spreadsheet_name)
print(f"Saved: {spreadsheet_name}.")
# --- FILTER: Only rows where Type == 4.0 and round(Height) == 17 ---
filtered_rows = [
row for row in rows_with_gap
if len(row) > 19
and str(row[4]).strip() == '4.0'
and row[19] not in ('', None)
and round(float(row[19])) == 17
]
self.export_csv(filtered_rows, base)
def export_csv(self, rows, base):
"""
Export selected columns to a CSV file.
Args:
rows (list): List of all processed rows (with gap and extra columns).
base (str): Base filename (without extension).
"""
# Combine header and extra_columns for full column list
full_columns = self.header + self.extra_columns
col_indices = []
for col in self.csv_columns:
if col in full_columns:
col_indices.append(full_columns.index(col))
else:
# If column not found, append -1 to indicate missing
col_indices.append(-1)
# csv_filename = f"{base}_DataMap.csv"
csv_filename = f"points.csv"
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(self.csv_columns)
for row in rows:
csv_row = []
for idx in col_indices:
if idx >= 0 and idx < len(row):
csv_row.append(row[idx])
else:
csv_row.append('')
writer.writerow(csv_row)
print(f"Saved: {csv_filename}.")
if __name__ == "__main__":
"""
Entry point for running the PDF data map extraction and export process.
"""
if len(sys.argv) < 2:
print("Usage: python script.py <inputfile.pdf>")
sys.exit(1)
pdf_path = sys.argv[1]
if not os.path.isfile(pdf_path):
print(f"Error: file '{pdf_path}' not found.")
sys.exit(1)
controller = PDFDataMapController(pdf_path)
controller.run()