├── requirements.txt ├── .gitattributes ├── install_reqs.bat ├── udf_to_pdf.bat ├── docx_to_udf.bat ├── udf_to_docx.bat ├── scanned_pdf_to_udf.bat ├── docx_to_udf.py ├── image_processor.py ├── main.py ├── README.md ├── table_processor.py ├── utils.py ├── scanned_pdf_to_udf.py ├── .gitignore ├── paragraph_processor.py ├── udf_to_md.py ├── Docs.md ├── udf_to_pdf.py └── udf_to_docx.py /requirements.txt: -------------------------------------------------------------------------------- 1 | python-docx 2 | PyMuPDF 3 | Pillow -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /install_reqs.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This script installs the packages listed in requirements.txt 3 | 4 | REM Check if requirements file exists 5 | IF NOT EXIST requirements.txt ( 6 | echo Requirements file not found! 7 | exit /b 1 8 | ) 9 | 10 | REM Install the packages using pip 11 | pip install -r requirements.txt 12 | 13 | REM Check if the installation was successful 14 | IF %ERRORLEVEL% NEQ 0 ( 15 | echo Failed to install some packages. 16 | exit /b 1 17 | ) 18 | 19 | echo Packages installed successfully. 20 | pause -------------------------------------------------------------------------------- /udf_to_pdf.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This script converts a UDF file to PDF format 3 | 4 | REM Check if a UDF file is provided by dragging 5 | IF "%~1"=="" ( 6 | echo Please drag a UDF file onto this script to convert it to PDF. 7 | pause 8 | exit /b 1 9 | ) 10 | 11 | REM Run the conversion 12 | python udf_to_pdf.py "%~1" 13 | 14 | REM Check if the conversion was successful 15 | IF %ERRORLEVEL% NEQ 0 ( 16 | echo Failed to convert UDF to PDF. 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | echo UDF successfully converted to PDF. 22 | pause 23 | -------------------------------------------------------------------------------- /docx_to_udf.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This script converts a DOCX file to UDF format 3 | 4 | REM Check if a DOCX file is provided by dragging 5 | IF "%~1"=="" ( 6 | echo Please drag a DOCX file onto this script to convert it to UDF. 7 | pause 8 | exit /b 1 9 | ) 10 | 11 | REM Run the conversion 12 | python docx_to_udf.py "%~1" 13 | 14 | REM Check if the conversion was successful 15 | IF %ERRORLEVEL% NEQ 0 ( 16 | echo Failed to convert DOCX to UDF. 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | echo DOCX successfully converted to UDF. 22 | pause 23 | -------------------------------------------------------------------------------- /udf_to_docx.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This script converts a UDF file to DOCX format 3 | 4 | REM Check if a UDF file is provided by dragging 5 | IF "%~1"=="" ( 6 | echo Please drag a UDF file onto this script to convert it to DOCX. 7 | pause 8 | exit /b 1 9 | ) 10 | 11 | REM Run the conversion 12 | python udf_to_docx.py "%~1" 13 | 14 | REM Check if the conversion was successful 15 | IF %ERRORLEVEL% NEQ 0 ( 16 | echo Failed to convert UDF to DOCX. 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | echo UDF successfully converted to DOCX. 22 | pause 23 | -------------------------------------------------------------------------------- /scanned_pdf_to_udf.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | :: This script converts a scanned PDF file to UDF format 3 | 4 | REM Check if a PDF file is provided by dragging 5 | IF "%~1"=="" ( 6 | echo Please drag a PDF file onto this script to convert it to UDF. 7 | pause 8 | exit /b 1 9 | ) 10 | 11 | REM Run the conversion 12 | python scanned_pdf_to_udf.py "%~1" 13 | 14 | REM Check if the conversion was successful 15 | IF %ERRORLEVEL% NEQ 0 ( 16 | echo Failed to convert scanned PDF to UDF. 17 | pause 18 | exit /b 1 19 | ) 20 | 21 | echo Scanned PDF successfully converted to UDF. 22 | pause 23 | -------------------------------------------------------------------------------- /docx_to_udf.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | from main import main 4 | 5 | def docx_to_udf(): 6 | if len(sys.argv) < 2: 7 | print("Usage: python main.py input.docx") 8 | sys.exit(1) 9 | 10 | input_file = sys.argv[1] 11 | 12 | if not os.path.isfile(input_file): 13 | print(f"Input file not found: {input_file}") 14 | sys.exit(1) 15 | 16 | filename, ext = os.path.splitext(input_file) 17 | 18 | if ext.lower() == '.docx': 19 | udf_file = filename + '.udf' 20 | main(input_file, udf_file) 21 | else: 22 | print("Please provide a .docx file.") 23 | sys.exit(1) 24 | 25 | if __name__ == '__main__': 26 | docx_to_udf() 27 | -------------------------------------------------------------------------------- /image_processor.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from docx.oxml.ns import qn 3 | from PIL import Image 4 | import io 5 | 6 | def process_image(drawing, document): 7 | try: 8 | inline = drawing.find('.//wp:inline', namespaces={'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) 9 | anchor = drawing.find('.//wp:anchor', namespaces={'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) 10 | 11 | extent = None 12 | if inline is not None: 13 | extent = inline.find('.//wp:extent', namespaces={'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) 14 | elif anchor is not None: 15 | extent = anchor.find('.//wp:extent', namespaces={'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}) 16 | 17 | if extent is not None: 18 | width = int(extent.get('cx')) // 9525 19 | height = int(extent.get('cy')) // 9525 20 | else: 21 | width = height = 100 22 | 23 | blip = drawing.find('.//a:blip', namespaces={'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}) 24 | if blip is not None: 25 | rId = blip.get(qn('r:embed')) 26 | if rId in document.part.rels: 27 | image_part = document.part.rels[rId].target_part 28 | image_bytes = image_part.blob 29 | 30 | try: 31 | with Image.open(io.BytesIO(image_bytes)) as img: 32 | png_buffer = io.BytesIO() 33 | img.save(png_buffer, format='PNG') 34 | png_bytes = png_buffer.getvalue() 35 | image_data = base64.b64encode(png_bytes).decode('utf-8') 36 | except Exception: 37 | image_data = base64.b64encode(image_bytes).decode('utf-8') 38 | 39 | return image_data, width, height 40 | 41 | except Exception: 42 | pass 43 | 44 | return None, None, None -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | from docx import Document 3 | from paragraph_processor import process_paragraph 4 | from table_processor import process_table 5 | 6 | def main(docx_file, udf_file): 7 | udf_template = ''' 8 |