├── requirements.txt
├── .gitattributes
├── install_reqs.bat
├── udf_to_pdf.bat
├── docx_to_udf.bat
├── udf_to_docx.bat
├── scanned_pdf_to_udf.bat
├── docx_to_udf.py
├── image_processor.py
├── main.py
├── README.md
├── table_processor.py
├── utils.py
├── scanned_pdf_to_udf.py
├── .gitignore
├── paragraph_processor.py
├── udf_to_md.py
├── Docs.md
├── udf_to_pdf.py
└── udf_to_docx.py
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-docx
2 | PyMuPDF
3 | Pillow
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/install_reqs.bat:
--------------------------------------------------------------------------------
@echo off
:: Installs the Python packages listed in the requirements.txt file that sits
:: next to this script.

REM %~dp0 expands to the folder containing this script (with a trailing
REM backslash), so the file is found regardless of the launch directory.
IF NOT EXIST "%~dp0requirements.txt" (
    echo Requirements file not found!
    REM Keep the window open so a double-click user can read the message.
    pause
    exit /b 1
)

REM Run pip through the same "python" command the converter scripts use.
python -m pip install -r "%~dp0requirements.txt"

REM A non-zero exit code from pip is reported as a failure.
IF %ERRORLEVEL% NEQ 0 (
    echo Failed to install some packages.
    pause
    exit /b 1
)

echo Packages installed successfully.
pause
--------------------------------------------------------------------------------
/udf_to_pdf.bat:
--------------------------------------------------------------------------------
@echo off
:: Converts a UDF file dropped onto this script to PDF format.

REM %~1 is the first argument with any surrounding quotes removed. It is
REM empty when the script is started without a dropped file.
IF "%~1"=="" (
    echo Please drag a UDF file onto this script to convert it to PDF.
    pause
    exit /b 1
)

REM Address the converter through %~dp0 (the folder holding this script) so it
REM is found even when the launch directory is not the script folder.
python "%~dp0udf_to_pdf.py" "%~1"

REM Any non-zero exit code from python is reported as a failure.
IF %ERRORLEVEL% NEQ 0 (
    echo Failed to convert UDF to PDF.
    pause
    exit /b 1
)

echo UDF successfully converted to PDF.
pause
23 |
--------------------------------------------------------------------------------
/docx_to_udf.bat:
--------------------------------------------------------------------------------
@echo off
:: Converts a DOCX file dropped onto this script to UDF format.

REM %~1 is the first argument with any surrounding quotes removed. It is
REM empty when the script is started without a dropped file.
IF "%~1"=="" (
    echo Please drag a DOCX file onto this script to convert it to UDF.
    pause
    exit /b 1
)

REM Address the converter through %~dp0 (the folder holding this script) so it
REM is found even when the launch directory is not the script folder.
python "%~dp0docx_to_udf.py" "%~1"

REM Any non-zero exit code from python is reported as a failure.
IF %ERRORLEVEL% NEQ 0 (
    echo Failed to convert DOCX to UDF.
    pause
    exit /b 1
)

echo DOCX successfully converted to UDF.
pause
23 |
--------------------------------------------------------------------------------
/udf_to_docx.bat:
--------------------------------------------------------------------------------
@echo off
:: Converts a UDF file dropped onto this script to DOCX format.

REM %~1 is the first argument with any surrounding quotes removed. It is
REM empty when the script is started without a dropped file.
IF "%~1"=="" (
    echo Please drag a UDF file onto this script to convert it to DOCX.
    pause
    exit /b 1
)

REM Address the converter through %~dp0 (the folder holding this script) so it
REM is found even when the launch directory is not the script folder.
python "%~dp0udf_to_docx.py" "%~1"

REM Any non-zero exit code from python is reported as a failure.
IF %ERRORLEVEL% NEQ 0 (
    echo Failed to convert UDF to DOCX.
    pause
    exit /b 1
)

echo UDF successfully converted to DOCX.
pause
23 |
--------------------------------------------------------------------------------
/scanned_pdf_to_udf.bat:
--------------------------------------------------------------------------------
@echo off
:: Converts a scanned PDF file dropped onto this script to UDF format.

REM %~1 is the first argument with any surrounding quotes removed. It is
REM empty when the script is started without a dropped file.
IF "%~1"=="" (
    echo Please drag a PDF file onto this script to convert it to UDF.
    pause
    exit /b 1
)

REM Address the converter through %~dp0 (the folder holding this script) so it
REM is found even when the launch directory is not the script folder.
python "%~dp0scanned_pdf_to_udf.py" "%~1"

REM Any non-zero exit code from python is reported as a failure.
IF %ERRORLEVEL% NEQ 0 (
    echo Failed to convert scanned PDF to UDF.
    pause
    exit /b 1
)

echo Scanned PDF successfully converted to UDF.
pause
23 |
--------------------------------------------------------------------------------
/docx_to_udf.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | from main import main
4 |
def docx_to_udf():
    """Convert the DOCX file named on the command line to a UDF file.

    The input path is taken from ``sys.argv[1]``. The output path is the
    input path with its extension replaced by ``.udf``; the conversion
    itself is delegated to ``main``.

    Prints a message and exits with status 1 when no argument is given,
    when the path is not an existing file, or when the extension is not
    ``.docx`` (compared case-insensitively).
    """
    if len(sys.argv) < 2:
        # Name this script rather than main.py, which has no CLI entry point.
        print("Usage: python docx_to_udf.py input.docx")
        sys.exit(1)

    input_file = sys.argv[1]

    if not os.path.isfile(input_file):
        print(f"Input file not found: {input_file}")
        sys.exit(1)

    filename, ext = os.path.splitext(input_file)

    if ext.lower() != '.docx':
        print("Please provide a .docx file.")
        sys.exit(1)

    main(input_file, filename + '.udf')
24 |
25 | if __name__ == '__main__':
26 | docx_to_udf()
27 |
--------------------------------------------------------------------------------
/image_processor.py:
--------------------------------------------------------------------------------
1 | import base64
2 | from docx.oxml.ns import qn
3 | from PIL import Image
4 | import io
5 |
def process_image(drawing, document):
    """Extract the picture referenced by a drawing element.

    Looks up the drawing's ``wp:extent`` (via ``wp:inline`` first, then
    ``wp:anchor``) for the size, and the ``a:blip`` relationship id for the
    image bytes held in ``document.part.rels``.

    Returns:
        ``(image_data, width, height)`` where ``image_data`` is a base64
        string (PNG-encoded when the image could be re-encoded, otherwise
        the raw bytes) and the sizes are ``cx``/``cy`` integer-divided by
        9525, or 100 when no extent is found. Returns
        ``(None, None, None)`` when there is no blip, the relationship id is
        unknown, or any error occurs.
    """
    wp_ns = {'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing'}
    a_ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
    nothing = (None, None, None)

    try:
        inline = drawing.find('.//wp:inline', namespaces=wp_ns)
        anchor = drawing.find('.//wp:anchor', namespaces=wp_ns)

        # An inline container takes precedence over an anchored one.
        container = inline if inline is not None else anchor
        extent = None
        if container is not None:
            extent = container.find('.//wp:extent', namespaces=wp_ns)

        if extent is None:
            width = height = 100
        else:
            width = int(extent.get('cx')) // 9525
            height = int(extent.get('cy')) // 9525

        blip = drawing.find('.//a:blip', namespaces=a_ns)
        if blip is None:
            return nothing

        rel_id = blip.get(qn('r:embed'))
        if rel_id not in document.part.rels:
            return nothing

        raw_bytes = document.part.rels[rel_id].target_part.blob

        try:
            with Image.open(io.BytesIO(raw_bytes)) as picture:
                png_out = io.BytesIO()
                picture.save(png_out, format='PNG')
                encoded = base64.b64encode(png_out.getvalue()).decode('utf-8')
        except Exception:
            # Re-encoding failed: fall back to the bytes as stored.
            encoded = base64.b64encode(raw_bytes).decode('utf-8')

        return encoded, width, height

    except Exception:
        # Best effort: any failure is reported as "no image".
        return nothing
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import zipfile
2 | from docx import Document
3 | from paragraph_processor import process_paragraph
4 | from table_processor import process_table
5 |
# Convert the DOCX file at docx_file into a UDF file written to udf_file.
# The document body is walked element by element: paragraphs go through
# process_paragraph and tables through process_table. The collected text and
# element markup are substituted into udf_template and stored as
# 'content.xml' inside a deflate-compressed ZIP archive.
# Errors while loading the DOCX or writing the archive are printed, not raised.
6 | def main(docx_file, udf_file):
# NOTE(review): only the {elements} placeholder is visible in this listing,
# while format() below is also given content=. The surrounding XML markup
# appears to be missing here; confirm against the real file.
7 | udf_template = '''
8 |
9 |
10 |
11 |
12 | {elements}
13 |
14 |
15 | '''
16 |
17 | try:
18 | document = Document(docx_file)
19 | except Exception as e:
20 | print(f"Error loading DOCX file: {e}")
21 | return
22 |
# content: text pieces joined into one string for the template.
# elements: markup strings, one per paragraph/table, joined with newlines.
# current_offset: running length of the text collected so far.
23 | content = []
24 | elements = []
25 | current_offset = 0
26 | EMPTY_PARAGRAPH_PLACEHOLDER = '\u200B' # Zero-width space
27 |
# Body children whose tag ends in 'p' or 'tbl' are converted; others are skipped.
28 | for element in document.element.body:
29 | if element.tag.endswith('p'): # Paragraph
30 | para_text, para_elements = process_paragraph(element, document, current_offset)
31 | elements.append(para_elements)
32 | content.append(para_text)
33 | current_offset += len(para_text)
34 | elif element.tag.endswith('tbl'): # Table
35 | table_text, table_element = process_table(element, document, current_offset)
36 | elements.append(table_element)
37 | content.append(table_text)
38 | current_offset += len(table_text)
39 |
# NOTE(review): the check below fires only when nothing at all was collected,
# not specifically after a table.
40 | # Ensure there's at least one paragraph after the table
41 | if not content:
42 | content.append(EMPTY_PARAGRAPH_PLACEHOLDER)
# NOTE(review): the f-string below is empty in this listing; its markup was
# presumably stripped.
43 | elements.append(f'')
44 |
45 | udf_content = udf_template.format(
46 | content=''.join(content),
47 | elements='\n'.join(elements)
48 | )
49 |
50 | try:
51 | with zipfile.ZipFile(udf_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
52 | zipf.writestr('content.xml', udf_content)
53 | print(f"UDF file created successfully: {udf_file}")
54 | except Exception as e:
55 | print(f"Error creating UDF file: {e}")
56 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # UDF Toolkit
2 | UYAP UDF dosya formatı ile ilgili çalışmalar
3 |
4 | [![Star History Chart](https://api.star-history.com/svg?repos=saidsurucu/udf-toolkit&type=Date)](https://www.star-history.com/#saidsurucu/udf-toolkit&Date)
5 |
6 | ## UDF dosyasını DOCX formatına çevirmek için
7 | ```
8 | python udf_to_docx.py input.udf
9 | ```
10 | ## UDF dosyasını PDF formatına çevirmek için
11 | ```
12 | python udf_to_pdf.py input.udf
13 | ```
14 | ## DOCX dosyasını UDF formatına çevirmek için
15 | ```
16 | python docx_to_udf.py input.docx
17 | ```
18 | Not: En iyi sonucu almak için Windows'ta çalıştırılmalıdır. Bazı DOCX özelliklerini dönüştürmek için Windows kütüphaneleri gereklidir. MacOS ve Linux'ta sonuçlar farklı olabilir.
19 | ## PDF dosyasını (imaj olarak) UDF formatına çevirmek için
20 | ```
21 | python scanned_pdf_to_udf.py input.pdf
22 | ```
23 | # Teknik Bilgiye Sahip Olmayanlar İçin Windows'ta Kullanım Talimatları
24 |
25 | Bu scriptlerin düzgün çalışabilmesi için Python'un sisteminizde kurulu olması gerekmektedir. Aşağıdaki adımları takip ederek Python'u yükleyebilirsiniz:
26 |
27 | 1. [Python'un resmi web sitesine](https://www.python.org/downloads/) gidin.
28 | 2. Sisteminizin işletim sistemine uygun Python sürümünü indirin (genellikle en son sürüm önerilir).
29 | 3. Kurulum sırasında "Add Python to PATH" seçeneğini işaretleyin.
30 |
31 | ## Kodu İndirmek
32 | Sağ üstteki yeşil renkli `Code` butonuna tıklayın. `Download ZIP`'e tıklayın. İnen sıkıştırılmış ZIP dosyasını bir klasöre çıkartın.
33 |
34 | ### 1. `install_reqs.bat`
35 | - **Amaç**: `requirements.txt` dosyasında listelenen gerekli Python paketlerini yükler.
36 | - **Nasıl Kullanılır**: `install_reqs.bat` scriptine çift tıklayın. Bu, `requirements.txt` dosyasında belirtilen tüm gerekli bağımlılıkları yükleyecektir.
37 |
38 | ### 2. `udf_to_docx.bat`
39 | - **Amaç**: UDF dosyasını DOCX formatına dönüştürür.
40 | - **Nasıl Kullanılır**: `.udf` dosyasını `udf_to_docx.bat` scriptinin üzerine sürükleyin. Script çalışacak ve girdi ile aynı dizinde bir `.docx` dosyası oluşturacaktır.
41 |
42 | ### 3. `udf_to_pdf.bat`
43 | - **Amaç**: UDF dosyasını PDF formatına dönüştürür.
44 | - **Nasıl Kullanılır**: `.udf` dosyasını `udf_to_pdf.bat` scriptinin üzerine sürükleyin. Script çalışacak ve girdi ile aynı dizinde bir `.pdf` dosyası oluşturacaktır.
45 |
46 | ### 4. `docx_to_udf.bat`
47 | - **Amaç**: DOCX dosyasını UDF formatına dönüştürür.
48 | - **Nasıl Kullanılır**: `.docx` dosyasını `docx_to_udf.bat` scriptinin üzerine sürükleyin. Script çalışacak ve girdi ile aynı dizinde bir `.udf` dosyası oluşturacaktır.
49 |
50 | ### 5. `scanned_pdf_to_udf.bat`
51 | - **Amaç**: Tarama yapılmış bir PDF dosyasını UDF formatına dönüştürür.
52 | - **Nasıl Kullanılır**: `.pdf` dosyasını `scanned_pdf_to_udf.bat` scriptinin üzerine sürükleyin. Script çalışacak ve girdi ile aynı dizinde bir `.udf` dosyası oluşturacaktır.
53 |
54 |
55 | ## UDF Formatı Dokümantasyonu
56 | [Docs.md](./Docs.md)
57 |
--------------------------------------------------------------------------------
/table_processor.py:
--------------------------------------------------------------------------------
1 | from docx.oxml.ns import qn
2 | from paragraph_processor import process_paragraph
3 |
# Convert a w:tbl element into (table_text, table_element).
# table_text is the concatenated text of every cell; table_element is the
# markup string for the table. current_offset is advanced locally by the
# length of each cell's text and is not returned.
# NOTE(review): column_count, column_spans, border_type and row_index are not
# used by any visible line; the f-strings that presumably consumed them look
# stripped in this listing.
4 | def process_table(table, document, current_offset):
5 | table_text = ""
6 | rows = []
7 | grid_cols = table.findall('.//w:gridCol', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
8 | column_count = len(grid_cols)
9 |
# NOTE(review): total_width is 0 when there are no w:gridCol elements or all
# widths are 0; the division below then raises ZeroDivisionError.
10 | # Calculate column widths
11 | total_width = sum(int(col.get(qn('w:w'), '0')) for col in grid_cols)
12 | column_widths = [int(col.get(qn('w:w'), '0')) for col in grid_cols]
13 | column_spans = ",".join([str(int((width / total_width) * 300)) for width in column_widths]) # Scale to 300
14 |
# Borders count as hidden only when every one of the six border elements is
# either absent or has w:val of 'none', 'nil' or '0'.
15 | # Check table borders
16 | tblBorders = table.find('.//w:tblBorders', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
17 | border_type = "borderCell" # Default to visible borders
18 | if tblBorders is not None:
19 | border_elements = ['top', 'left', 'bottom', 'right', 'insideH', 'insideV']
20 | all_borders_none = all(
21 | tblBorders.find(f'.//w:{border}', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) is None or
22 | tblBorders.find(f'.//w:{border}', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}).get(qn('w:val')) in ['none', 'nil', '0']
23 | for border in border_elements
24 | )
25 | if all_borders_none:
26 | border_type = "borderNone"
27 | else:
28 | # If tblBorders is not defined, assume borderless table
29 | border_type = "borderNone"
30 |
# NOTE(review): './/w:tr' and './/w:tc' are descendant searches, so rows and
# cells of nested tables would presumably be matched here as well.
31 | for row_index, row in enumerate(table.findall('.//w:tr', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})):
32 | cells = []
33 | for cell in row.findall('.//w:tc', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
34 | cell_text, cell_elements = process_cell(cell, document, current_offset)
35 | cells.append(f'| {"".join(cell_elements)} | ')
36 | table_text += cell_text
37 | current_offset += len(cell_text)
38 |
39 | rows.append(f'{"".join(cells)}
')
40 |
41 | table_element = f'
'
42 | return table_text, table_element
43 |
44 |
# Convert a w:tc element into (cell_text, cell_elements).
# cell_text is the text of all paragraphs found in the cell; cell_elements is
# a list whose items are joined with "" by process_table.
# NOTE(review): the f'' literals below are empty in this listing; their markup
# was presumably stripped.
45 | def process_cell(cell, document, current_offset):
46 | cell_text = ""
47 | cell_elements = []
48 | paragraphs = cell.findall('.//w:p', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
49 |
50 | for i, paragraph in enumerate(paragraphs):
51 | para_text, para_elements = process_paragraph(paragraph, document, current_offset)
52 | cell_text += para_text
# process_paragraph returns its markup as a single string, so extend() adds it
# one character at a time; the caller's "".join() reassembles the same text.
53 | cell_elements.extend(para_elements)
54 | current_offset += len(para_text)
55 |
56 | # Add a line break between paragraphs, but not after the last paragraph
57 | if i < len(paragraphs) - 1 and para_text.strip():
58 | cell_text += '\n'
59 | cell_elements.append(f'')
60 | current_offset += 1
61 |
62 | # If cell is empty, add a space character
63 | if not cell_text:
64 | cell_text = " "
65 | cell_elements.append(f'')
66 | current_offset += 1
67 |
68 | return cell_text, cell_elements
69 |
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
1 | from docx.oxml.ns import qn
2 |
def get_alignment(paragraph):
    """Return the alignment code of a paragraph element as a string.

    Reads ``w:val`` of the first ``w:jc`` descendant: ``center`` gives
    ``'1'``, ``right`` gives ``'2'`` and ``both`` gives ``'3'``. Any other
    value, or a missing ``w:jc``, gives ``'0'`` (left).
    """
    jc = paragraph.find('.//w:jc', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
    if jc is None:
        return '0'  # Default to Left

    codes = {'center': '1', 'right': '2', 'both': '3'}
    return codes.get(jc.get(qn('w:val')), '0')
14 |
def get_indent_attrs(paragraph):
    """Build the indent attribute string for a paragraph element.

    Reads ``w:left``, ``w:right`` and ``w:firstLine`` from the first
    ``w:ind`` descendant and divides each by 20. ``LeftIndent`` and
    ``RightIndent`` are always emitted (``0.0`` when missing);
    ``FirstLineIndent`` is appended only when ``w:firstLine`` is set.
    """
    ind = paragraph.find('.//w:ind', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})

    if ind is None:
        left = right = first_line = None
    else:
        left = ind.get(qn('w:left'))
        right = ind.get(qn('w:right'))
        first_line = ind.get(qn('w:firstLine'))

    def _scaled(raw):
        # Missing or empty values count as no indent.
        return float(raw) / 20 if raw else 0.0

    attrs = f'LeftIndent="{_scaled(left)}" RightIndent="{_scaled(right)}"'
    if first_line:
        attrs += f' FirstLineIndent="{float(first_line) / 20}"'
    return attrs
27 |
def get_bullet_attrs(paragraph):
    """Build the bullet attribute string for a list paragraph.

    Returns ``''`` unless the paragraph has a ``w:numPr`` containing both
    ``w:ilvl`` and ``w:numId``. Otherwise returns the ``Bulleted``,
    ``ListId``, ``ListLevel`` (``w:ilvl`` value plus one) and ``BulletType``
    attributes, with the bullet type resolved by ``get_bullet_type``.
    """
    w_ns = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}

    num_pr = paragraph.find('.//w:numPr', namespaces=w_ns)
    if num_pr is None:
        return ''

    level_el = num_pr.find('.//w:ilvl', namespaces=w_ns)
    num_id_el = num_pr.find('.//w:numId', namespaces=w_ns)
    if level_el is None or num_id_el is None:
        return ''

    bullet_type = get_bullet_type(num_id_el.get(qn("w:val")))
    list_id = num_id_el.get(qn("w:val"))
    list_level = int(level_el.get(qn("w:val"))) + 1
    return f'Bulleted="true" ListId="{list_id}" ListLevel="{list_level}" BulletType="{bullet_type}"'
37 |
def get_bullet_type(num_id):
    """Map a numbering id string to a bullet type name.

    Ids ``"1"`` through ``"6"`` have dedicated bullet types; every other
    value falls back to ``BULLET_TYPE_ELLIPSE``.
    """
    shapes = ("ELLIPSE", "RECTANGLE", "RECTANGLE_D", "ARROW", "DIAMOND", "TRIANGLE")
    by_id = {str(position): f"BULLET_TYPE_{shape}" for position, shape in enumerate(shapes, start=1)}
    try:
        return by_id[num_id]
    except KeyError:
        return "BULLET_TYPE_ELLIPSE"  # Default to ELLIPSE
48 |
def get_font_properties(run):
    """Build the font attribute string for a run element.

    The family comes from the ``w:ascii`` attribute of ``w:rFonts`` and the
    size from the ``w:val`` attribute of ``w:sz`` (half-points, halved to
    points). These elements hold their values in attributes rather than in
    element text, so the attributes are read directly. Missing values fall
    back to ``"Times New Roman"`` and ``"20"`` half-points (10 pt).

    ``bold="true"`` / ``italic="true"`` are appended when ``w:b`` / ``w:i``
    is present and not explicitly switched off through ``w:val``.

    Returns:
        Space-separated attributes, e.g. ``family="Arial" size="12"``.
    """
    w_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    w_ns = {'w': w_uri}

    def _attr(path, name):
        # Value of the namespaced attribute on the first match, or None.
        node = run.find(path, namespaces=w_ns)
        if node is None:
            return None
        return node.get(f'{{{w_uri}}}{name}')

    def _is_on(path):
        # A toggle element without w:val means "on"; 0/false/off disable it.
        node = run.find(path, namespaces=w_ns)
        if node is None:
            return False
        return node.get(f'{{{w_uri}}}val', 'true').lower() not in ('0', 'false', 'off')

    font_family = _attr('.//w:rFonts', 'ascii') or "Times New Roman"
    half_points = _attr('.//w:sz', 'val') or "20"
    font_size = str(int(half_points) // 2)  # Convert half-points to points

    style_attrs = [f'family="{font_family}"', f'size="{font_size}"']
    if _is_on('.//w:b'):
        style_attrs.append('bold="true"')
    if _is_on('.//w:i'):
        style_attrs.append('italic="true"')

    return ' '.join(style_attrs)
61 |
def get_line_spacing(paragraph):
    """Return the UDF line spacing value for a paragraph element.

    Reads ``w:line`` and ``w:lineRule`` from the first ``w:spacing``
    descendant:

    * ``auto``: ``line / 240 - 1`` (so DOCX 2.0 maps to UDF 1.0).
    * ``exact`` / ``atLeast``: ``line / 20 - 12``.

    A ``w:line`` without ``w:lineRule`` is handled as ``auto``, which is the
    rule's default when omitted. Results are clamped at ``0.0``. A missing
    ``w:spacing`` or ``w:line``, or an unknown rule, gives ``0.0``.
    """
    w_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'

    spacing = paragraph.find('.//w:spacing', namespaces={'w': w_uri})
    if spacing is None:
        return 0.0  # Default to single spacing in UDF

    line = spacing.get(f'{{{w_uri}}}line')
    if not line:
        return 0.0

    # An omitted lineRule means "auto" whenever a line value is present.
    line_rule = spacing.get(f'{{{w_uri}}}lineRule') or 'auto'

    if line_rule == 'auto':
        # Convert to UDF line spacing (DOCX 2.0 = UDF 1.0)
        return max(0.0, (float(line) / 240) - 1)
    if line_rule in ('exact', 'atLeast'):
        # Convert twips to points and adjust for UDF
        return max(0.0, (float(line) / 20) - 12)
    return 0.0
75 |
--------------------------------------------------------------------------------
/scanned_pdf_to_udf.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import zipfile
4 | import base64
5 | import fitz # PyMuPDF
6 | import io
7 | from PIL import Image
8 |
# Convert the PDF at pdf_file into a UDF file written to udf_file.
# For every page the extracted text (if any) and each embedded image,
# re-encoded as PNG and base64, are collected, followed by a newline that
# separates pages. The result is stored as 'content.xml' inside a
# deflate-compressed ZIP archive.
# Any exception raised along the way is caught at the bottom and printed;
# nothing is re-raised.
9 | def pdf_to_udf(pdf_file, udf_file):
# NOTE(review): only the {elements} placeholder is visible in this listing,
# while format() below is also given content=. The surrounding XML markup
# appears to be missing here; confirm against the real file.
10 | udf_template = '''
11 |
12 |
13 |
14 |
15 | {elements}
16 |
17 |
18 | '''
19 |
20 | try:
# NOTE(review): pdf_document is not explicitly closed in the visible code.
21 | pdf_document = fitz.open(pdf_file)
22 | content = []
23 | elements = []
24 | current_offset = 0
25 |
26 | for page_num in range(len(pdf_document)):
27 | page = pdf_document[page_num]
28 |
29 | # Extract text
30 | text = page.get_text()
31 | if text:
32 | content.append(text)
# NOTE(review): the f'' literals in this function are empty in this listing;
# their markup was presumably stripped.
33 | elements.append(f'')
34 | current_offset += len(text)
35 |
36 | # Extract images
37 | image_list = page.get_images(full=True)
38 | for img_index, img in enumerate(image_list):
# The first item of each image tuple is used as the xref for extract_image().
39 | xref = img[0]
40 | base_image = pdf_document.extract_image(xref)
41 | image_bytes = base_image["image"]
42 |
43 | # Convert image to base64
44 | image = Image.open(io.BytesIO(image_bytes))
45 | buffered = io.BytesIO()
46 | image.save(buffered, format="PNG")
47 | img_str = base64.b64encode(buffered.getvalue()).decode()
48 |
49 | # Add placeholder for image in content
50 | placeholder = '\uFFFC' # Object Replacement Character
51 | content.append(placeholder)
52 |
53 | # Add image element
54 | elements.append(f'')
55 | current_offset += 1
56 |
57 | # Add a newline between pages
58 | content.append('\n')
59 | elements.append(f'')
60 | current_offset += 1
61 |
62 | udf_content = udf_template.format(
63 | content=''.join(content),
64 | elements='\n'.join(elements)
65 | )
66 |
67 | with zipfile.ZipFile(udf_file, 'w', zipfile.ZIP_DEFLATED) as zipf:
68 | zipf.writestr('content.xml', udf_content)
69 | print(f"UDF file created successfully: {udf_file}")
70 | except Exception as e:
71 | print(f"Error creating UDF file: {e}")
72 |
def main():
    """Convert the PDF file named on the command line to a UDF file.

    The input path is taken from ``sys.argv[1]``. The output path is the
    input path with its extension replaced by ``.udf``; the conversion
    itself is delegated to ``pdf_to_udf``.

    Prints a message and exits with status 1 when no argument is given,
    when the path is not an existing file, or when the extension is not
    ``.pdf`` (compared case-insensitively).
    """
    if len(sys.argv) < 2:
        # Use this script's real file name so the hint works when pasted.
        print("Usage: python scanned_pdf_to_udf.py input.pdf")
        sys.exit(1)

    input_file = sys.argv[1]

    if not os.path.isfile(input_file):
        print(f"Input file not found: {input_file}")
        sys.exit(1)

    filename, ext = os.path.splitext(input_file)

    if ext.lower() != '.pdf':
        print("Please provide a .pdf file.")
        sys.exit(1)

    pdf_to_udf(input_file, filename + '.udf')
92 |
93 | if __name__ == '__main__':
94 | main()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | ~$test.docx
162 | .DS_Store
163 | .python-version
164 | hello.py
165 | pdf-test.pdf
166 | pyproject.toml
167 | *.udf
168 | *.lock
169 | *.pdf
170 | *.xml
171 | *.png
172 | *.docx
173 |
--------------------------------------------------------------------------------
/paragraph_processor.py:
--------------------------------------------------------------------------------
1 | from docx.oxml.ns import qn
2 | from image_processor import process_image
3 | from utils import get_alignment, get_indent_attrs, get_bullet_attrs
4 |
# Convert a w:p element into (para_text, paragraph_element).
# para_text is the paragraph's text, with U+FFFC standing in for each image
# and a tab character for each w:tab; paragraph_element is the markup string
# built from the collected run elements.
# current_offset is advanced locally and is not returned; callers advance
# their own offset by len(para_text).
# NOTE(review): the f'' literals in this function are empty in this listing
# (and listing line 42 is missing); their markup was presumably stripped.
5 | def process_paragraph(paragraph, document, current_offset):
6 | EMPTY_PARAGRAPH_PLACEHOLDER = '\u200B' # Zero-width space
7 | TAB_CHARACTER = '\t' # Tab character
8 |
9 | para_text = ""
10 | para_elements = []
11 |
12 | # Numaralandırma ve madde işareti özelliklerini al
# (English: read the numbering / bullet properties.)
13 | numPr = paragraph.find('.//w:numPr', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
14 | numbered = False
15 | list_id = ""
16 | list_level = ""
17 | number_type = ""
18 |
# A paragraph is treated as a list item only when both w:ilvl and w:numId
# exist; the level is stored one-based.
19 | if numPr is not None:
20 | ilvl = numPr.find('.//w:ilvl', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
21 | numId = numPr.find('.//w:numId', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
22 | if ilvl is not None and numId is not None:
23 | numbered = True
24 | list_id = numId.get(qn("w:val"))
25 | list_level = str(int(ilvl.get(qn("w:val"))) + 1)
26 | number_type = get_number_type(list_id)
27 |
28 | for run in paragraph.findall('.//w:r', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}):
29 | # Process images in the run
30 | drawing_elements = run.findall('.//w:drawing', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})
31 | if drawing_elements:
32 | for drawing in drawing_elements:
33 | image_data, width, height = process_image(drawing, document)
34 | if image_data:
35 | # Insert a placeholder character in content
36 | placeholder = '\uFFFC' # Object Replacement Character
37 | para_text += placeholder
38 |
39 | # Add image element
40 | para_elements.append(
41 | f''
43 | )
44 | current_offset += 1
45 | else:
46 | print("Failed to process image, skipping...")
47 |
48 |
49 |
50 | # Process text and tab characters in the run
51 | text = run.findtext('.//w:t', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) or ''
52 | if text or run.find('.//w:tab', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) is not None:
# NOTE(review): findtext() returns element text, while w:rFonts and w:sz
# normally carry their values in attributes (w:ascii, w:val). These lookups
# therefore likely always fall back to the defaults; verify.
53 | # Get font properties
54 | font_family = run.findtext('.//w:rFonts[@w:ascii]', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) or "Times New Roman"
55 | font_size = run.findtext('.//w:sz', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) or "20"
56 | font_size = str(int(font_size) // 2) # Convert half-points to points
57 |
58 | style_attrs = [f'family="{font_family}"', f'size="{font_size}"']
59 | if run.find('.//w:b', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) is not None:
60 | style_attrs.append('bold="true"')
61 | if run.find('.//w:i', namespaces={'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}) is not None:
62 | style_attrs.append('italic="true"')
63 |
64 | style_attr_str = ' '.join(style_attrs)
65 |
# Only direct children of the run are visited, in document order.
# NOTE(review): child.text is None for an empty w:t element, which would make
# the concatenation below raise TypeError; confirm whether that can occur.
66 | # Process text and tab characters
67 | for child in run:
68 | if child.tag.endswith('}t'): # Text
69 | para_elements.append(f'')
70 | para_text += child.text
71 | current_offset += len(child.text)
72 | elif child.tag.endswith('}tab'): # Tab
73 | para_elements.append(f'')
74 | para_text += TAB_CHARACTER
75 | current_offset += 1
76 |
77 | # If paragraph is empty, add placeholder
78 | if not para_text:
79 | para_text = EMPTY_PARAGRAPH_PLACEHOLDER
80 | para_elements.append(f'')
81 | current_offset += 1
82 |
83 | # Numaralandırma ve madde işareti özelliklerini paragraf elementine ekle
# (English: add the numbering / bullet attributes to the paragraph element.)
# Types starting with "NUMBER_TYPE_" are emitted as Numbered lists; every
# other type is emitted as a Bulleted list.
84 | paragraph_attrs = f'Alignment="{get_alignment(paragraph)}" {get_indent_attrs(paragraph)}'
85 | if numbered:
86 | if number_type.startswith("NUMBER_TYPE_"):
87 | paragraph_attrs += f' Numbered="true" ListId="{list_id}" ListLevel="{list_level}" NumberType="{number_type}"'
88 | else:
89 | paragraph_attrs += f' Bulleted="true" ListId="{list_id}" ListLevel="{list_level}" BulletType="{number_type}"'
90 |
91 | paragraph_element = f'{"".join(para_elements)}'
92 | return para_text, paragraph_element
93 |
def get_number_type(list_id):
    """Map a list id string to a numbering or bullet type name.

    Ids ``"1"`` through ``"13"`` have fixed types; any other value falls
    back to ``NUMBER_TYPE_NUMBER_TRE``.
    """
    # This table may need to be adapted to the numbering definitions of the
    # document being converted.
    known_types = {
        "1": "NUMBER_TYPE_CHAR_SMALL_DOT",
        "2": "BULLET_TYPE_ARROW",
        "3": "NUMBER_TYPE_ROMAN_BIG_DOT",
        "4": "NUMBER_TYPE_CHAR_BIG_DOT",
        "5": "NUMBER_TYPE_CHAR_SMALL_PARANTHESE",
        "6": "NUMBER_TYPE_NUMBER_TRE",
        "7": "NUMBER_TYPE_ROMAN_SMALL_DOT",
        "8": "BULLET_TYPE_ELLIPSE",
        "9": "BULLET_TYPE_RECTANGLE",
        "10": "BULLET_TYPE_RECTANGLE_D",
        "11": "NUMBER_TYPE_NUMBER_PARANTHESE",
        "12": "BULLET_TYPE_DIAMOND",
        "13": "BULLET_TYPE_TRIANGLE",
        # Add further numbering and bullet types here.
    }
    try:
        return known_types[list_id]
    except KeyError:
        # Unknown ids use NUMBER_TYPE_NUMBER_TRE by default.
        return "NUMBER_TYPE_NUMBER_TRE"
113 |
--------------------------------------------------------------------------------
/udf_to_md.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import xml.etree.ElementTree as ET
4 | import zipfile
5 | import base64
6 | import io
7 |
def is_zip_file(file_path):
    """Return True when *file_path* opens as a ZIP archive, else False.

    Only ``zipfile.BadZipFile`` is translated into ``False``; other errors,
    such as a missing file, propagate to the caller.
    """
    try:
        archive = zipfile.ZipFile(file_path, 'r')
    except zipfile.BadZipFile:
        return False
    archive.close()
    return True
15 |
# Opening HTML wrappers keyed by a paragraph's 'Alignment' attribute. Markdown
# has no native alignment, so aligned paragraphs are wrapped in inline HTML.
# NOTE(review): presumably 1=center, 2=right, 3=justify, and the exact wrapper
# markup is a reconstruction -- confirm against the UDF format notes.
_ALIGNMENT_OPEN_TAGS = {
    '1': '<div align="center">',
    '2': '<div align="right">',
    '3': '<div align="justify">',
}
_ALIGNMENT_CLOSE_TAG = '</div>'

# Literal CDATA wrapper that is removed if it survives XML parsing.
_CDATA_PREFIX = '<![CDATA['
_CDATA_SUFFIX = ']]>'


def _load_udf_root(udf_file):
    """Return the XML root of *udf_file* (ZIP holding content.xml, or bare XML)."""
    try:
        archive = zipfile.ZipFile(udf_file, 'r')
    except zipfile.BadZipFile:
        archive = None  # not an archive: fall back to parsing it as plain XML

    if archive is not None:
        with archive:
            if 'content.xml' not in archive.namelist():
                print("The 'content.xml' file could not be found in the UDF file.")
                sys.exit(1)
            with archive.open('content.xml') as content_file:
                return ET.parse(content_file).getroot()

    try:
        return ET.parse(udf_file).getroot()
    except ET.ParseError:
        print(f"The file {udf_file} is neither a valid ZIP nor a valid XML file.")
        sys.exit(1)


def _emphasize(text, marker):
    """Wrap *text* in *marker*, keeping leading/trailing whitespace outside it."""
    core = text.strip()
    if not core:
        # Nothing visible to emphasise; emitting bare markers would be noise.
        return text
    lead_len = len(text) - len(text.lstrip())
    lead = text[:lead_len]
    trail = text[lead_len + len(core):]
    return f"{lead}{marker}{core}{marker}{trail}"


def _render_inline(paragraph, content_text):
    """Render the children of a paragraph element as one inline markdown string."""
    pieces = []
    for child in paragraph:
        if child.tag == 'content':
            # Each run addresses a slice of the shared content text.
            start_offset = int(child.get('startOffset', '0'))
            length = int(child.get('length', '0'))
            text = content_text[start_offset:start_offset + length]

            bold = child.get('bold', 'false') == 'true'
            italic = child.get('italic', 'false') == 'true'
            if bold and italic:
                text = _emphasize(text, "***")
            elif bold:
                text = _emphasize(text, "**")
            elif italic:
                text = _emphasize(text, "*")
            pieces.append(text)
        elif child.tag == 'space':
            pieces.append(" ")
        elif child.tag == 'image':
            # Images are not extracted; a textual placeholder marks the spot.
            pieces.append("[Image]")
    return "".join(pieces)


def _render_paragraph(paragraph, content_text):
    """Render a top-level paragraph, wrapping it in HTML when it is aligned."""
    text = _render_inline(paragraph, content_text)
    open_tag = _ALIGNMENT_OPEN_TAGS.get(paragraph.get('Alignment', '0'))
    if open_tag:
        return f"{open_tag}{text}{_ALIGNMENT_CLOSE_TAG}"
    return text


def _render_table(table, content_text):
    """Render a table element as a markdown table with generic column headers."""
    column_count = int(table.get('columnCount', '1'))
    lines = [
        "| " + " | ".join(["Column"] * column_count) + " |",
        "| " + " | ".join(["---"] * column_count) + " |",
    ]
    for row in table.findall('row'):
        cells = []
        for cell in row.findall('cell'):
            cell_text = " ".join(
                _render_inline(para, content_text)
                for para in cell.findall('paragraph')
            )
            # A markdown table row must stay on one line, and a raw pipe
            # would be read as a column separator, so flatten and escape.
            cell_text = cell_text.replace("\r", "").replace("\n", " ")
            cells.append(cell_text.replace("|", "\\|").strip())
        lines.append("| " + " | ".join(cells) + " |")
    return "\n".join(lines) + "\n"


def udf_to_markdown(udf_file):
    """Convert a UDF document to a markdown string.

    The input may be a ZIP archive containing ``content.xml`` or a bare XML
    file. Paragraphs become text blocks separated by blank lines (bold and
    italic runs use markdown emphasis); tables become markdown tables whose
    header cells all read "Column".

    Args:
        udf_file: Path to the UDF file.

    Returns:
        str: The generated markdown, or an empty string when the document
        has no 'elements' section (a message is printed in that case).

    Raises:
        SystemExit: With status 1, after printing a message, when the
            archive lacks ``content.xml``, the file is neither a ZIP nor
            well-formed XML, or the 'content' section is missing.
    """
    root = _load_udf_root(udf_file)

    content_element = root.find('content')
    if content_element is None:
        print("'content' could not be found in the XML.")
        sys.exit(1)

    # An empty content element yields None; treat it as empty text so the
    # offset slicing in the renderers stays valid.
    content_text = content_element.text or ""
    if content_text.startswith(_CDATA_PREFIX) and content_text.endswith(_CDATA_SUFFIX):
        content_text = content_text[len(_CDATA_PREFIX):-len(_CDATA_SUFFIX)]

    elements_element = root.find('elements')
    if elements_element is None:
        print("'elements' could not be found in the XML.")
        return ""

    blocks = []
    for elem in elements_element:
        if elem.tag == 'paragraph':
            blocks.append(_render_paragraph(elem, content_text) + "\n\n")
        elif elem.tag == 'table':
            blocks.append(_render_table(elem, content_text) + "\n")
    return "".join(blocks)
177 |
def main():
    """Command-line entry point: convert one UDF file to markdown.

    Reads the input path from ``sys.argv[1]``, prints the generated markdown
    to stdout, and also writes it next to the input file with the extension
    replaced by ``.md``.

    Raises:
        SystemExit: With status 1 when no input path is given or the path
            is not an existing file, so calling scripts can detect failure.
    """
    if len(sys.argv) < 2:
        print("Usage: python udf_to_md.py input.udf")
        sys.exit(1)

    udf_file = sys.argv[1]
    if not os.path.isfile(udf_file):
        print(f"Input file not found: {udf_file}")
        sys.exit(1)

    # Convert the document and echo the result to the console.
    markdown_content = udf_to_markdown(udf_file)
    print(markdown_content)

    # Always save a copy alongside the input: <input name>.md
    markdown_file = os.path.splitext(udf_file)[0] + '.md'
    with open(markdown_file, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)
    print(f"Markdown file created: {markdown_file}")


# Run the converter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/Docs.md:
--------------------------------------------------------------------------------
1 | # UYAP UDF Dosya Formatı
2 |
3 | ## İçindekiler
4 |
5 | 1. [Genel Bakış](#genel-bakış)
6 | 2. [UDF Dosya Yapısı](#udf-dosya-yapısı)
7 | 3. [XML Yapısı](#xml-yapısı)
8 | 4. [Kök Eleman](#kök-eleman)
9 | 5. [Ana Bölümler](#ana-bölümler)
10 | * [İçerik Bölümü (`<content>`)](#içerik-bölümü-content)
11 | * [Özellikler Bölümü (`<properties>`)](#özellikler-bölümü-properties)
12 | * [Elemanlar Bölümü (`<elements>`)](#elemanlar-bölümü-elements)
13 | * [Stiller Bölümü (`<styles>`)](#stiller-bölümü-styles)
14 | * [Veri Bölümü (`<data>`) (Varsayımsal)](#veri-bölümü-data-varsayımsal)
15 | 6. [Detaylı Eleman Açıklamaları ve Özellik Örnekleri](#detaylı-eleman-açıklamaları-ve-özellik-örnekleri)
16 | * [Üstbilgi (`<header>`)](#üstbilgi-header)
17 | * [Altbilgi (`