I have a script set up to copy entire lines containing a certain text sequence from a Writer document (file1) to another file (file2), and then remove them from the original (the reduced original file is then file3). I would expect the number of characters in file1 to be equal to the sum of the characters in file2 and file3, but that is not the case: file1 has 107,638 characters, file2 has 28,255 characters, and file3 has 82,313 characters. Why is that? I need to ensure that no data was lost. I see no differences when I compare the files visually. If I try to compare the number of characters via Python, two files (with many rows containing text) are always reported as “empty” (0 words), no matter what I do. Thank you for your help.
LO 24.8.5.2 (X86_64)
CPU threads: 4; OS: Windows 10.0 Build 19045; UI render: Skia/Raster; VCL: win
Locale: cs-CZ (cs_CZ); UI: cs-CZ
(Here is the full Python code:
import uno
import os
from com.sun.star.beans import PropertyValue
# Startup diagnostics: confirm the module was loaded and show which file is running.
print("✅ Script loaded.")
print("📁 Running from file:", os.path.abspath(__file__))

# Markers to search for, in priority order: "oo.", "nn.", ... down to "bb.".
# They are lower case because cell text is lower-cased before it is compared,
# which makes the search case insensitive.
sequences = [f"{letter * 2}." for letter in "onmlkjihgfedcb"]
def prop(name, value):
    """Build a UNO ``PropertyValue`` struct from a name/value pair.

    Args:
        name: Stored in the struct's ``Name`` field.
        value: Stored in the struct's ``Value`` field.

    Returns:
        The populated ``PropertyValue``.
    """
    pv = PropertyValue()
    pv.Name, pv.Value = name, value
    return pv
def connect_to_libreoffice():
    """Resolve the component context of a LibreOffice instance over a socket.

    Returns:
        The object returned by the URL resolver for the
        ``StarOffice.ComponentContext`` name on ``localhost:2002``.
    """
    print("🔌 Connecting to LibreOffice...")
    bootstrap_ctx = uno.getComponentContext()
    url_resolver = bootstrap_ctx.ServiceManager.createInstanceWithContext(
        "com.sun.star.bridge.UnoUrlResolver", bootstrap_ctx
    )
    # NOTE(review): presumably the office process must already be listening on
    # this socket (host localhost, port 2002) — confirm how it is started.
    connection_url = "uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext"
    return url_resolver.resolve(connection_url)
def load_doc(ctx, url):
    """Open the document at ``url`` with the ``Hidden`` load property set.

    Args:
        ctx: Component context used to create the ``Desktop`` service.
        url: URL of the document to load (the caller passes ``file:///`` URLs).

    Returns:
        Whatever ``loadComponentFromURL`` returns for that URL.
    """
    print("📂 Opening document...")
    service_manager = ctx.ServiceManager
    desktop = service_manager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
    load_args = (prop("Hidden", True),)
    return desktop.loadComponentFromURL(url, "_blank", 0, load_args)
def save_doc(doc, url):
    """Store a copy of ``doc`` at ``url`` using the ``writer8`` filter.

    Args:
        doc: Document object exposing ``storeToURL``.
        url: Target URL; the ``Overwrite`` property is set to ``True``.
    """
    print(f"💾 Saving: {url}")
    store_args = (
        prop("FilterName", "writer8"),
        prop("Overwrite", True),
    )
    doc.storeToURL(url, store_args)
def extract_matching_rows(doc, search_sequences=None):
    """Group table rows by the first search sequence found in any of their cells.

    Every text table of ``doc`` is scanned row by row. A row is assigned to the
    first entry of ``search_sequences`` (in list order) that occurs in the
    lower-cased text of at least one of its cells; a row is therefore recorded
    at most once.

    Args:
        doc: Document exposing ``TextTables``.
        search_sequences: Lower-case strings to look for, in priority order.
            Defaults to the module-level ``sequences`` list.

    Returns:
        A dict mapping each sequence to a list of
        ``(table, row_index, column_count)`` tuples, in scan order.
    """
    if search_sequences is None:
        search_sequences = sequences
    print("🔍 Searching for rows containing specific sequences...")
    tables = doc.TextTables
    matched_rows = {seq: [] for seq in search_sequences}
    for name in tables.ElementNames:
        tbl = tables.getByName(name)
        row_count = tbl.Rows.Count
        col_count = tbl.Columns.Count
        for r in range(row_count):
            # Read every cell of the row once instead of once per sequence;
            # each read is a call into the office process.
            cell_texts = []
            for c in range(col_count):
                try:
                    cell_texts.append(tbl.getCellByPosition(c, r).String.lower())
                except Exception:
                    # A position that cannot be read is skipped so that the
                    # remaining cells of the row are still inspected.
                    continue
            # Empty sequences never match: "" is contained in every string.
            found_seq = next(
                (
                    seq
                    for seq in search_sequences
                    if seq and any(seq in cell_text for cell_text in cell_texts)
                ),
                None,
            )
            if found_seq:
                matched_rows[found_seq].append((tbl, r, col_count))
    print("🔎 Found rows for each sequence.")
    return matched_rows
def copy_cell_content_with_format(src_cell, dst_cell):
    """Copy a cell's text into another cell one character at a time.

    For every character, a fixed set of character properties (colour, weight,
    posture, font name, underline, strikeout, background colour) is read from
    the source selection and applied to the destination cursor before the
    character is inserted.

    The source is walked with a single forward-moving cursor whose end is
    signalled by ``goRight`` returning ``False``. This avoids bounding the loop
    by ``len(src_cell.String)``, which is measured on the Python string rather
    than in cursor steps, and avoids re-seeking from the start of the cell for
    every character.

    Args:
        src_cell: Cell to read from; must provide ``createTextCursor``.
        dst_cell: Cell to write to; must provide ``createTextCursor`` and
            ``insertString``.

    Errors are reported with ``print`` and never propagate to the caller.
    """
    copied_props = (
        "CharColor", "CharWeight", "CharPosture", "CharFontName",
        "CharUnderline", "CharStrikeout", "CharBackColor",
    )
    failed_props = set()
    try:
        src_cursor = src_cell.createTextCursor()
        dst_cursor = dst_cell.createTextCursor()
        src_cursor.gotoStart(False)
        # Select exactly one step at a time; stop when the cursor cannot advance.
        while src_cursor.goRight(1, True):
            char = src_cursor.getString()
            if char:
                # Set format before inserting the character
                for prop_name in copied_props:
                    try:
                        value = src_cursor.getPropertyValue(prop_name)
                        dst_cursor.setPropertyValue(prop_name, value)
                    except Exception:
                        # Formatting is best effort: keep copying the text, but
                        # remember the property so the failure is reported.
                        failed_props.add(prop_name)
                dst_cell.insertString(dst_cursor, char, False)
                dst_cursor.gotoEnd(False)
            # Drop the selection so the next step starts after this character.
            src_cursor.collapseToEnd()
    except Exception as e:
        print(f"⚠️ Error while copying by characters: {e}")
    if failed_props:
        print(f"⚠️ Could not copy formatting properties: {', '.join(sorted(failed_props))}")
def main():
    """Move every table row containing a search sequence into a new document.

    Steps:
      1. Load the input document hidden and collect the matching rows.
      2. Create a new Writer document with one table per sequence and copy the
         matching rows into it, then save it as the output file.
      3. Delete the matching rows from the input document and save the result
         as the modified file.

    If nothing matches, the output file contains only a notice and the
    modified file is not written.

    Raises:
        RuntimeError: If the input document could not be loaded.
    """
    print("🚀 Running main()...")
    input_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/dokument.odt"
    output_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/radky_se_sekvencemi.odt"
    modified_file = "file:///C:/Users/Alfons1%20CSc/Documents/zz%20skripty%20atd/upraveny.odt"
    ctx = connect_to_libreoffice()
    doc = load_doc(ctx, input_file)
    if doc is None:
        raise RuntimeError(f"Could not open {input_file}")
    try:
        matched_rows = extract_matching_rows(doc)
        desktop = ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
        new_doc = desktop.loadComponentFromURL("private:factory/swriter", "_blank", 0, ())
        text = new_doc.Text
        cursor = text.createTextCursor()
        if not any(matched_rows.values()):
            text.insertString(cursor, "❌ No rows found with the matching sequences.", False)
            save_doc(new_doc, output_file)
            return
        paragraph_break = uno.getConstantByName(
            "com.sun.star.text.ControlCharacter.PARAGRAPH_BREAK")
        # Creating tables for each sequence
        for seq in sequences:
            rows = matched_rows[seq]
            if not rows:
                continue
            text.insertString(cursor, f"Sequence: {seq}\n", False)
            # Rows may come from tables of different widths: size the new table
            # for the widest one so no trailing cells are dropped.
            width = max(cols for _, _, cols in rows)
            new_table = new_doc.createInstance("com.sun.star.text.TextTable")
            new_table.initialize(len(rows), width)
            text.insertTextContent(cursor, new_table, False)
            for i, (tbl, row_index, cols) in enumerate(rows):
                # Copy as many cells as this particular source row has.
                for c in range(cols):
                    try:
                        src_cell = tbl.getCellByPosition(c, row_index)
                        dst_cell = new_table.getCellByPosition(c, i)
                        copy_cell_content_with_format(src_cell, dst_cell)
                    except Exception as e:
                        print(f"⚠️ Error while copying cell ({row_index},{c}): {e}")
            text.insertControlCharacter(cursor, paragraph_break, False)
        save_doc(new_doc, output_file)
        print("✂️ Deleting rows from the original document...")
        _remove_rows_bottom_up(matched_rows)
        save_doc(doc, modified_file)
        print("✅ DONE.")
    finally:
        # The source document was loaded hidden, so nothing else would ever
        # close it; release it explicitly once the work is done or has failed.
        doc.close(True)


def _remove_rows_bottom_up(matched_rows):
    """Delete every matched row from its table, highest row index first.

    The row indices were recorded before any deletion. Removing a row shifts
    every row below it up by one, so deleting sequence by sequence would hit
    the wrong rows whenever two sequences matched rows of the same table.
    Processing all rows in globally descending index order guarantees that,
    within each table, a deletion never moves a row that is still pending.
    """
    pending = [
        (tbl, row_index)
        for rows in matched_rows.values()
        for tbl, row_index, _ in rows
    ]
    # Sort on the index only; the table objects themselves are never compared.
    pending.sort(key=lambda item: item[1], reverse=True)
    for tbl, row_index in pending:
        try:
            tbl.Rows.removeByIndex(row_index, 1)  # Deleting rows in the original file
        except Exception as e:
            print(f"⚠️ Error while deleting row {row_index}: {e}")
# Script entry point: main() is called only when __name__ is "__main__".
# NOTE(review): the trailing ")" is not Python — it appears to close the "(" that
# opens the post's "(Here is the full Python code:" line; drop it before running.
if __name__ == "__main__":
    main() )
Formatting fixed by ajlittoz