Strange difference in number of characters after copying text from tables

alfons11 · March 29, 2025, 11:14pm

I have a script set up to copy entire lines containing a certain text sequence from a Writer document (file1), copy them to another file (file2), and then remove them from the original (the reduced original file is then file3). I would expect the number of characters in file1 to be equal to the sum of the characters in files2 and 3, but that is not the case. File1 has 107,638 characters, file2 has 28,255 characters, file3 has 82,313 characters. Please, why? I need to assure that no data was lost. I see no differences if I try to compare the files visually. If I try to compare the number of carracters via Python, two files (with many rows containing text) are always marked as “empty” (0 words) - no matter what I do. Thank you for your help.

LO 24.8.5.2 (X86_64)

CPU threads: 4; OS: Windows 10.0 Build 19045; UI render: Skia/Raster; VCL: win
Locale: cs-CZ (cs_CZ); UI: cs-CZ

(Here is the full Python code:

import uno
import os
from com.sun.star.beans import PropertyValue

print("✅ Script loaded.")
print("📁 Running from file:", os.path.abspath(__file__))

# Defining sequences to search for (case insensitive)
sequences = ["oo.", "nn.", "mm.", "ll.", "kk.", "jj.", "ii.", "hh.", "gg.", "ff.", "ee.", "dd.", "cc.", "bb."]

def prop(name, value):
    p = PropertyValue()
    p.Name = name
    p.Value = value
    return p

def connect_to_libreoffice():
    print("🔌 Connecting to LibreOffice...")
    local_ctx = uno.getComponentContext()
    resolver = local_ctx.ServiceManager.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver", local_ctx)
    ctx = resolver.resolve(
        "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext")
    return ctx

def load_doc(ctx, url):
    print("📂 Opening document...")
    desktop = ctx.ServiceManager.createInstanceWithContext(
        "com.sun.star.frame.Desktop", ctx)
    return desktop.loadComponentFromURL(url, "_blank", 0, (prop("Hidden", True),))

def save_doc(doc, url):
    print(f"💾 Saving: {url}")
    doc.storeToURL(url, (
        prop("FilterName", "writer8"),
        prop("Overwrite", True),
    ))

def extract_matching_rows(doc):
    print("🔍 Searching for rows containing specific sequences...")

    tables = doc.TextTables
    matched_rows = {seq: [] for seq in sequences}

    for name in tables.ElementNames:
        tbl = tables.getByName(name)
        row_count = tbl.Rows.Count
        col_count = tbl.Columns.Count

        for r in range(row_count):
            found_seq = None
            for seq in sequences:
                for c in range(col_count):
                    try:
                        cell = tbl.getCellByPosition(c, r)
                        if seq in cell.String.lower():
                            found_seq = seq
                            break
                    except:
                        continue
                if found_seq:
                    matched_rows[found_seq].append((tbl, r, col_count))
                    break  # stop checking further once a sequence is found

    print(f"🔎 Found rows for each sequence.")
    return matched_rows

def copy_cell_content_with_format(src_cell, dst_cell):
    try:
        src_cursor = src_cell.createTextCursor()
        dst_cursor = dst_cell.createTextCursor()

        for i in range(len(src_cell.String)):
            src_cursor.gotoStart(False)
            src_cursor.goRight(i, False)
            src_cursor.goRight(1, True)

            char = src_cursor.getString()
            if not char:
                continue

            # Set format before inserting the character
            for prop_name in [
                "CharColor", "CharWeight", "CharPosture", "CharFontName",
                "CharUnderline", "CharStrikeout", "CharBackColor"
            ]:
                try:
                    value = src_cursor.getPropertyValue(prop_name)
                    dst_cursor.setPropertyValue(prop_name, value)
                except:
                    pass

            dst_cell.insertString(dst_cursor, char, False)
            dst_cursor.gotoEnd(False)

    except Exception as e:
        print(f"⚠️ Error while copying by characters: {e}")

def main():
    print("🚀 Running main()...")

    input_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/dokument.odt"
    output_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/radky_se_sekvencemi.odt"
    modified_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/upraveny.odt"

    ctx = connect_to_libreoffice()
    doc = load_doc(ctx, input_file)
    matched_rows = extract_matching_rows(doc)

    desktop = ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    new_doc = desktop.loadComponentFromURL("private:factory/swriter", "_blank", 0, ())
    text = new_doc.Text
    cursor = text.createTextCursor()

    if not any(matched_rows.values()):
        text.insertString(cursor, "❌ No rows found with the matching sequences.", False)
        save_doc(new_doc, output_file)
        return

    # Creating tables for each sequence
    for seq in sequences:
        rows = matched_rows[seq]
        if rows:
            text.insertString(cursor, f"Sequence: {seq}\n", False)

            new_table = new_doc.createInstance("com.sun.star.text.TextTable")
            new_table.initialize(len(rows), rows[0][2])  # Number of rows and columns in the new table
            text.insertTextContent(cursor, new_table, False)

            for i, (tbl, row_index, _) in enumerate(rows):
                for c in range(rows[0][2]):
                    try:
                        src_cell = tbl.getCellByPosition(c, row_index)
                        dst_cell = new_table.getCellByPosition(c, i)
                        copy_cell_content_with_format(src_cell, dst_cell)
                    except Exception as e:
                        print(f"⚠️ Error while copying cell ({row_index},{c}): {e}")

            text.insertControlCharacter(cursor,
                uno.getConstantByName("com.sun.star.text.ControlCharacter.PARAGRAPH_BREAK"), False)

    save_doc(new_doc, output_file)

    print("✂️ Deleting rows from the original document...")
    for seq, rows in matched_rows.items():
        for tbl, row_index, _ in reversed(rows):
            try:
                tbl.Rows.removeByIndex(row_index, 1)  # Deleting rows in the original file
            except:
                pass

    save_doc(doc, modified_file)

    print("✅ DONE.")

if __name__ == "__main__":
    main() )

Formatting fixed by ajlittoz

fpy · March 31, 2025, 6:05am

try to unzip your .odt files to spot where the difference comes from.
might be some image encoding difference, or some unused styles stripped off …