├── .devcontainer └── devcontainer.json ├── .github └── FUNDING.yml ├── .gitignore ├── .streamlit └── config.toml ├── LICENSE ├── README.md ├── app.py ├── assets └── sidebar.html ├── packages.txt ├── requirements.txt └── utils ├── helpers.py ├── init_session_states.py ├── page_config.py ├── render_sidebar.py └── version.py /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.md", 9 | "app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y 10 | Open in Streamlit 11 | 12 |

13 | 14 | ## Features 🚀 15 | 16 | PDF WorkDesk is designed to be a user-friendly, lightweight tool that empowers you to handle PDFs with ease and efficiency. Here's what you can do: 17 | 18 | - **Upload PDFs**: Choose files from your disk or load them directly via URL. 19 | - **Preview PDFs**: Instantly view your PDF's contents and metadata. 20 | - **Extract Data**: Seamlessly extract text, images, and tables from your documents. 21 | - **Secure PDFs**: Encrypt or decrypt your PDFs for added security. 22 | - **Modify PDFs**: Rotate, resize, merge, or convert PDFs to Word with simple clicks. 23 | - **Watermark PDFs**: Add a label to watermark your PDFs. 24 | - **Reduce Size**: Reduce your PDF's file size without compromising quality. 25 | 26 | And the best part? It's completely **free**, **open-source**, and respects your privacy by not requiring any personal information! 🛡️ 27 | 28 | ## Get Involved! 🤝 29 | 30 | Your feedback and contributions can help shape the future of PDF WorkDesk. If you have ideas or features you'd like to see, let's collaborate! 31 | 32 | - **Contribute**: Submit PRs or open issues on GitHub. 33 | - **Connect**: Have questions or suggestions? Reach out to me on [LinkedIn](https://linkedin.com/in/siddhantsadangi). 34 | 35 | ## Acknowledgements 🤗 36 | 37 | This app is based on [pypdf](https://github.com/py-pdf/pypdf) and basically provides a UI for non-technical users to use supported features of pypdf. 
Thanks to the pypdf team 🫶 38 | Also, thanks as usual to the team behind and the community surrounding [Streamlit](https://streamlit.io/) 🎈 39 | 40 | Special thanks to the following individuals for their contributions: 41 | 42 | - [Pierre-Louis BESCOND](https://github.com/pierrelouisbescond) for contributing the PDF to Word conversion code 43 | - [Tomasz Hasiów](https://discuss.streamlit.io/u/TomJohn/summary) for his help with forcing dark-mode and introducing me to CSS injection 44 | - [Émilien Foissotte](https://github.com/Emilien-Foissotte) for implementing the watermark functionality 45 | 46 | Finally, to all open-source projects out there used either directly or indirectly 🙇 47 | 48 | ## Support PDF WorkDesk 💖 49 | 50 | Love PDF WorkDesk? Here's how you can show your support: 51 | 52 | - **Star**: Give us a star on GitHub and help spread the word! 53 | - **Share**: Tell your friends and colleagues about us on social media. 54 | - **Donate**: Buy me a coffee or sponsor me on GitHub! 55 | 56 |

57 | Buy Me A Coffee 58 | 59 | Sponsor me on GitHub 62 |

"""PDF WorkDesk (app.py): Streamlit entry point.

Renders the header and sidebar, loads a PDF through ``helpers.load_pdf`` and
then draws one expander per operation (extract, convert, protect, rotate,
scale, merge, watermark, reduce size).

The text, protected, rotated, scaled and merged outputs are built in memory
rather than written to fixed file names in the working directory, so those
sections no longer share files between sessions or leave open file handles.
"""

import os
import traceback
from io import BytesIO

import streamlit as st
from pypdf import PaperSize, PdfReader, PdfWriter, Transformation
from pypdf.errors import FileNotDecryptedError
from streamlit import session_state
from streamlit_pdf_viewer import pdf_viewer

from utils import helpers, init_session_states, page_config, render_sidebar

PAGES_ERROR = "Specified pages don't exist. Check the format."
PASSWORD_CAPTION = "Will remove password if present"
DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


def _stem(name: str) -> str:
    """Return *name* without its final extension (``"a.b.pdf"`` -> ``"a.b"``)."""
    return os.path.splitext(name)[0]


def _writer_to_bytes(writer: PdfWriter) -> bytes:
    """Serialise *writer* into an in-memory PDF and return its bytes."""
    buffer = BytesIO()
    writer.write(buffer)
    return buffer.getvalue()


def _render_extract_text(container, reader: PdfReader) -> None:
    """Draw the text-extraction expander inside *container*."""
    with container.expander(label="🔍 Extract text"):
        pages_col, mode_col = st.columns(2)

        page_numbers_str = helpers.select_pages(
            container=pages_col,
            key="extract_text_pages",
        )
        mode = mode_col.radio(
            "Extraction mode",
            options=["plain", "layout"],
            horizontal=True,
            help="Layout mode extracts text in a format resembling the layout of the source PDF",
        )

        if not page_numbers_str:
            return

        try:
            text = helpers.extract_text(reader, page_numbers_str, mode)
        except (IndexError, ValueError):
            st.error(PAGES_ERROR, icon="⚠️")
            return

        st.text(text)
        # Hand the string straight to Streamlit: no shared "text.txt" on disk
        # and no write/read encoding mismatch.
        st.download_button(
            "💾 Download extracted text",
            data=text,
            file_name="text.txt",
            mime="text/plain",
            use_container_width=True,
        )


def _render_extract_images(container, reader: PdfReader) -> None:
    """Draw the image-extraction expander inside *container*."""
    with container.expander(label="🖼️ Extract images"):
        page_numbers_str = helpers.select_pages(
            container=st,
            key="extract_image_pages",
        )
        if not page_numbers_str:
            return

        try:
            images = helpers.extract_images(reader, page_numbers_str)
        except (IndexError, ValueError):
            st.error(PAGES_ERROR, icon="⚠️")
            return

        if not images:
            st.info("No images found")
            return

        for data, name in images.items():
            st.image(data, caption=name)


def _render_extract_tables(container) -> None:
    """Draw the table-extraction expander inside *container*."""
    with container.expander("📊 Extract table"):
        if page_numbers_str := helpers.select_pages(
            container=st,
            key="extract_table_pages",
        ):
            helpers.extract_tables(session_state["file"], page_numbers_str)


def _render_convert_to_word(container, pdf: bytes) -> None:
    """Draw the PDF-to-Word expander inside *container*."""
    with container.expander("🔄️ Convert to Word"):
        st.caption("Takes ~1 second/page. Will remove password if present")

        if st.button("Convert PDF to Word", use_container_width=True):
            st.download_button(
                "📥 Download Word document",
                data=helpers.convert_pdf_to_word(pdf),
                file_name=f"{_stem(session_state['name'])}.docx",
                mime=DOCX_MIME,
                use_container_width=True,
            )


def _render_add_password(container, pdf: bytes, reader: PdfReader) -> None:
    """Draw the add/change-password expander inside *container*.

    The encrypted result is kept in ``session_state["protected_pdf"]`` so the
    download button survives reruns without writing a file that other
    sessions could pick up.
    """
    verb = "Change" if session_state["is_encrypted"] else "Add"
    with container.expander(f"🔐 {verb} password"):
        new_password = st.text_input("Enter password", type="password")
        algorithm = st.selectbox(
            "Algorithm",
            options=["RC4-40", "RC4-128", "AES-128", "AES-256-R5", "AES-256"],
            index=3,
            help="Use `RC4` for compatibility and `AES` for security",
        )

        filename = f"protected_{session_state['name']}"
        # Identifies the currently loaded document so a stale result from a
        # previously loaded PDF is never offered for download.
        source_id = hash(pdf)

        if st.button(
            "🔒 Submit",
            use_container_width=True,
            disabled=not new_password,
        ):
            writer = PdfWriter()
            for page in reader.pages:
                writer.add_page(page)
            writer.encrypt(new_password, algorithm=algorithm)

            session_state["protected_pdf"] = {
                "source": source_id,
                "name": filename,
                "data": _writer_to_bytes(writer),
            }

        protected = session_state.get("protected_pdf")
        if (
            protected
            and protected["source"] == source_id
            and protected["name"] == filename
        ):
            st.download_button(
                "📥 Download protected PDF",
                data=protected["data"],
                mime="application/pdf",
                file_name=filename,
                use_container_width=True,
            )


def _render_remove_password(container, reader: PdfReader) -> None:
    """Draw the remove-password expander inside *container*."""
    with container.expander("🔓 Remove password"):
        if not reader.is_encrypted:
            st.info("PDF does not have a password")
            return

        # The decrypted copy is written by helpers.handle_encrypted_pdf during
        # the preview; read it fully so the file handle is closed right away.
        decrypted_filename = session_state["decrypted_filename"]
        with open(decrypted_filename, "rb") as f:
            decrypted = f.read()

        st.download_button(
            "📥 Download unprotected PDF",
            data=decrypted,
            mime="application/pdf",
            file_name=decrypted_filename,
            use_container_width=True,
        )


def _render_rotate(container, reader: PdfReader) -> None:
    """Draw the rotate expander inside *container*."""
    with container.expander("🔃 Rotate PDF"):
        # TODO: Add password back to converted PDF if original was protected
        st.caption(PASSWORD_CAPTION)
        angle = st.slider(
            "Clockwise angle",
            min_value=0,
            max_value=270,
            step=90,
            format="%d°",
        )

        writer = PdfWriter()
        for page in reader.pages:
            # Rotate the writer's copy, not the reader's page.
            writer.add_page(page).rotate(angle)
        rotated = _writer_to_bytes(writer)

        pdf_viewer(rotated, height=250, width=300)
        st.download_button(
            "📥 Download rotated PDF",
            data=rotated,
            mime="application/pdf",
            file_name=f"{_stem(session_state['name'])}_rotated_{angle}.pdf",
            use_container_width=True,
        )


def _render_scale(container, reader: PdfReader) -> None:
    """Draw the resize/scale expander inside *container*."""
    with container.expander("↔ Resize/Scale PDF"):
        # TODO: Add password back to converted PDF if original was protected
        st.caption(PASSWORD_CAPTION)

        size_names = [
            attr
            for attr in dir(PaperSize)
            if not attr.startswith("__") and not callable(getattr(PaperSize, attr))
        ]
        new_size = st.selectbox(
            "New size",
            options=size_names,
            index=4,
            help="Changes will be apparent only on printing the PDF",
        )
        scale_content = st.slider(
            "Scale content",
            min_value=0.1,
            max_value=2.0,
            step=0.1,
            value=1.0,
            help="Scale content independently of the page size",
            format="%fx",
        )

        target = getattr(PaperSize, new_size)
        content_scale = Transformation().scale(sx=scale_content, sy=scale_content)

        writer = PdfWriter()
        for page in reader.pages:
            # Transform the copy owned by the writer. Scaling the reader's
            # page in place would leak into every later section of this run
            # (e.g. the merge would append already-resized pages).
            scaled_page = writer.add_page(page)
            scaled_page.scale_to(width=target.width, height=target.height)
            scaled_page.add_transformation(content_scale)
        scaled = _writer_to_bytes(writer)

        st.caption("Content scaling preview")
        pdf_viewer(scaled, height=250, width=300)
        st.download_button(
            "📥 Download scaled PDF",
            data=scaled,
            mime="application/pdf",
            file_name=(
                f"{_stem(session_state['name'])}_scaled_{new_size}_{scale_content}x.pdf"
            ),
            use_container_width=True,
        )


def _render_merge(container, reader: PdfReader) -> None:
    """Draw the merge expander inside *container*."""
    with container.expander("➕ Merge PDFs"):
        # TODO: Add password back to converted PDF if original was protected
        st.caption(
            "Second PDF will be appended to the first. Passwords will be removed from both."
        )
        # TODO: Add more merge options (https://pypdf.readthedocs.io/en/stable/user/merging-pdfs.html#showing-more-merging-options)
        pdf_to_merge, reader_to_merge, *_ = helpers.load_pdf(key="merge")

        if st.button(
            "➕ Merge PDFs", disabled=(not pdf_to_merge), use_container_width=True
        ):
            merger = PdfWriter()
            for source in (reader, reader_to_merge):
                merger.append(source)
            merged = _writer_to_bytes(merger)

            pdf_viewer(merged, height=250, width=300)
            st.download_button(
                "📥 Download merged PDF",
                data=merged,
                mime="application/pdf",
                file_name="merged.pdf",
                use_container_width=True,
            )


def _render_watermark(container, pdf: bytes) -> None:
    """Draw the watermark expander inside *container*."""
    with container.expander("©️ Add watermark"):
        text_watermark = st.text_input(
            "Enter watermark text",
            placeholder="PDF-Workdesk Watermark",
        )
        if not text_watermark:
            return

        size_watermark = st.slider("Font size", min_value=6, max_value=30, value=12)
        color_col, opacity_col = st.columns([1, 3])
        color = color_col.color_picker("Color", "#F90004")
        transparency = opacity_col.slider(
            "Opacity", min_value=0.0, max_value=1.0, value=0.8
        )

        watermarked_pdf = helpers.watermark_pdf(
            pdf=pdf,
            stamp_label=text_watermark,
            stamp_size=size_watermark,
            stamp_color=color,
            stamp_transparency=transparency,
        )
        pdf_viewer(watermarked_pdf, height=400, width=500)

        st.download_button(
            "📥 Download watermarked PDF",
            data=watermarked_pdf,
            mime="application/pdf",
            file_name="watermarked.pdf",
            use_container_width=True,
        )


def _render_reduce_size(pdf: bytes, reader: PdfReader) -> None:
    """Draw the full-width size-reduction expander with before/after previews."""
    with st.expander("🤏 Reduce PDF size"):
        # TODO: Add password back to converted PDF if original was protected
        st.caption(PASSWORD_CAPTION)

        pdf_small = pdf
        options_col, original_col, reduced_col = st.columns(3)

        with options_col:
            remove_duplication = st.toggle(
                "Remove duplication",
                help="""
                Some PDF documents contain the same object multiple times.
                For example, if an image appears three times in a PDF it could be embedded three times.
                Or it can be embedded once and referenced twice.
                **Note:** This option will not remove objects, rather it will use a reference to the original object for subsequent uses.
                """,
            )
            remove_images = st.toggle(
                "Remove images",
                help="Remove images from the PDF. Will also remove duplication.",
            )

            if remove_images or remove_duplication:
                pdf_small = helpers.remove_images(
                    pdf,
                    remove_images=remove_images,
                    password=session_state.password,
                )

            if st.toggle(
                "Reduce image quality",
                help="""
                Reduce the quality of images in the PDF. Will also remove duplication.
                May not work for all cases.
                """,
                disabled=remove_images,
            ):
                quality = st.slider(
                    "Quality",
                    min_value=0,
                    max_value=100,
                    value=50,
                    disabled=remove_images,
                )
                pdf_small = helpers.reduce_image_quality(
                    pdf_small,
                    quality,
                    password=session_state.password,
                )

            if st.toggle(
                "Lossless compression",
                help="Compress PDF without losing quality",
            ):
                pdf_small = helpers.compress_pdf(
                    pdf_small, password=session_state.password
                )

            # len() is the real file size; sys.getsizeof() would add the
            # Python object overhead to both numbers.
            original_size = len(pdf)
            reduced_size = len(pdf_small)
            st.caption(f"Reduction: {100 - (reduced_size / original_size) * 100:.2f}%")

        with original_col:
            st.caption(f"Original size: {original_size / 1024:.2f} KB")
            helpers.preview_pdf(
                reader,
                pdf,
                key="other",
                password=session_state.password,
            )

        with reduced_col:
            st.caption(f"Reduced size: {reduced_size / 1024:.2f} KB")
            helpers.preview_pdf(
                PdfReader(BytesIO(pdf_small)),
                pdf_small,
                key="other",
                password=session_state.password,
            )
            st.download_button(
                "📥 Download smaller PDF",
                data=pdf_small,
                mime="application/pdf",
                file_name=f"{_stem(session_state['name'])}_reduced.pdf",
                use_container_width=True,
            )


def main() -> None:
    """Render the whole page: header, sidebar, loader and every operation."""
    page_config.set()

    # ---------- HEADER ----------
    st.title("📄 PDF WorkDesk!")
    st.write(
        "User-friendly, lightweight, and open-source tool to preview and extract content and metadata from PDFs, add or remove passwords, modify, merge, convert and compress PDFs."
    )

    init_session_states.init()

    render_sidebar.render()

    # ---------- OPERATIONS ----------
    # TODO: Extract attachments (https://pypdf.readthedocs.io/en/stable/user/extract-attachments.html)
    # TODO: Undo last operation
    # TODO: Update metadata (https://pypdf.readthedocs.io/en/stable/user/metadata.html)

    try:
        pdf, reader, password, is_encrypted = helpers.load_pdf(key="main")
    except FileNotDecryptedError:
        st.error("PDF is password protected. Please enter the password to proceed.")
        return

    session_state["password"] = password
    session_state["is_encrypted"] = is_encrypted

    if not pdf:
        return

    # The expanders alternate between the two columns; the call order below
    # fixes their vertical order within each column.
    lcol, rcol = st.columns(2)
    _render_extract_text(lcol, reader)
    _render_extract_images(rcol, reader)
    _render_extract_tables(lcol)
    _render_convert_to_word(rcol, pdf)
    _render_add_password(lcol, pdf, reader)
    _render_remove_password(rcol, reader)
    _render_rotate(lcol, reader)
    _render_scale(rcol, reader)
    _render_merge(lcol, reader)
    _render_watermark(rcol, pdf)
    _render_reduce_size(pdf, reader)


try:
    main()
except Exception as e:  # top-level boundary: report instead of a blank page
    st.error(
        f"""The app has encountered an error:
        `{e}`
        Please create an issue [here](https://github.com/SiddhantSadangi/pdf-workdesk/issues/new)
        with the below traceback""",
        icon="🥺",
    )
    st.code(traceback.format_exc())

st.success(
    "[Star the repo](https://github.com/SiddhantSadangi/pdf-workdesk) to show your :heart:",
    icon="⭐",
)
Sidebar content 6 | 7 | 9 | 10 | 11 | 12 | 13 | 14 |
16 |

17 | v{VERSION}
18 | ⭐ the repo to be notified of updates

19 |
22 |
23 |

24 | Made with ❤️ by Siddhant Sadangi 25 |
26 | 27 | 28 |
29 |
30 |
35 | 36 | Sponsor me on GitHub
39 |
40 |
41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /packages.txt: -------------------------------------------------------------------------------- 1 | libjpeg-dev 2 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv export --no-hashes --format requirements-txt 3 | altair==5.5.0 4 | attrs==25.3.0 5 | beautifulsoup4==4.13.3 6 | blinker==1.9.0 7 | cachetools==5.5.2 8 | certifi==2025.1.31 9 | cffi==1.17.1 ; platform_python_implementation != 'PyPy' 10 | chardet==5.2.0 11 | charset-normalizer==3.4.1 12 | click==8.1.8 13 | colorama==0.4.6 ; sys_platform == 'win32' 14 | cryptography==44.0.2 15 | fire==0.7.0 16 | fonttools==4.56.0 17 | gitdb==4.0.12 18 | gitpython==3.1.44 19 | idna==3.10 20 | jinja2==3.1.6 21 | jsonschema==4.23.0 22 | jsonschema-specifications==2024.10.1 23 | lxml==5.3.1 24 | markupsafe==3.0.2 25 | narwhals==1.32.0 26 | numpy==2.2.4 27 | opencv-python-headless==4.11.0.86 28 | packaging==24.2 29 | pandas==2.2.3 30 | pdf2docx==0.5.8 31 | pdfminer-six==20250327 32 | pdfplumber==0.11.6 33 | pillow==11.1.0 34 | protobuf==5.29.4 35 | pyarrow==19.0.1 36 | pycparser==2.22 ; platform_python_implementation != 'PyPy' 37 | pydeck==0.9.1 38 | pymupdf==1.25.4 39 | pypdf==5.4.0 40 | pypdfium2==4.30.1 41 | python-dateutil==2.9.0.post0 42 | python-docx==1.1.2 43 | pytz==2025.2 44 | referencing==0.36.2 45 | reportlab==4.3.1 46 | requests==2.32.3 47 | rpds-py==0.24.0 48 | six==1.17.0 49 | smmap==5.0.2 50 | soupsieve==2.6 51 | st-social-media-links==0.1.4 52 | streamlit==1.44.0 53 | streamlit-pdf-viewer==0.0.21 54 | tenacity==9.0.0 55 | termcolor==2.5.0 56 | toml==0.10.2 57 | tornado==6.5.1 58 | typing-extensions==4.13.0 59 | tzdata==2025.2 60 | urllib3==2.3.0 61 | watchdog==6.0.0 ; sys_platform != 'darwin' 62 | 
-------------------------------------------------------------------------------- /utils/helpers.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | import re 3 | from datetime import datetime 4 | from io import BytesIO 5 | from pathlib import Path 6 | from random import random 7 | from typing import Callable, Dict, Literal, Optional, Tuple, Union 8 | 9 | import pandas as pd 10 | import pdfplumber 11 | import requests 12 | import streamlit as st 13 | from pdf2docx import Converter 14 | from PIL import Image 15 | from pypdf import PdfReader, PdfWriter, Transformation 16 | from pypdf.errors import PdfReadError, PdfStreamError 17 | from reportlab.lib.pagesizes import letter 18 | from reportlab.pdfgen import canvas 19 | from streamlit import session_state 20 | from streamlit.runtime.uploaded_file_manager import UploadedFile 21 | from streamlit_pdf_viewer import pdf_viewer 22 | 23 | 24 | def select_pages(container, key: str): 25 | return container.text_input( 26 | "Pages to extract from?", 27 | placeholder="all", 28 | help=""" 29 | Format 30 | ------ 31 | **all:** all pages 32 | **2:** 2nd page 33 | **1-3:** pages 1 to 3 34 | **2,4:** pages 2 and 4 35 | **1-3,5:** pages 1 to 3 and 5""", 36 | key=key, 37 | ).lower() 38 | 39 | 40 | @st.cache_data 41 | def image_to_pdf(stamp_img: Union[Path, str]) -> PdfReader: 42 | img = Image.open(stamp_img) 43 | img_as_pdf = BytesIO() 44 | img.save(img_as_pdf, "pdf") 45 | return PdfReader(img_as_pdf) 46 | 47 | 48 | @st.cache_data 49 | def watermark_img( 50 | reader: PdfReader, 51 | stamp_img: UploadedFile, 52 | ) -> None: 53 | # Convert the image to a PDF 54 | stamp_pdf = image_to_pdf(stamp_img) 55 | 56 | # Then use the same stamp code from above 57 | stamp_page = stamp_pdf.pages[0] 58 | 59 | writer = PdfWriter() 60 | 61 | writer.append(reader) 62 | 63 | for content_page in writer.pages: 64 | content_page.merge_transformed_page( 65 | stamp_page, Transformation(), expand=True, 
over=False 66 | ) 67 | 68 | # TODO: Write to byte_stream 69 | with open("watermarked.pdf", "wb") as fp: 70 | writer.write(fp) 71 | 72 | 73 | def get_option(key: Literal["main", "merge"]) -> str: 74 | return st.radio( 75 | label="Upload a PDF, or load PDF from a URL", 76 | options=( 77 | "Upload a PDF ⬆️", 78 | "Load PDF from a URL 🌐", 79 | ), 80 | horizontal=True, 81 | help="PDFs are deleted from the server when you\n" 82 | "* upload another PDF, or\n" 83 | "* clear the file uploader, or\n" 84 | "* close the browser tab.", 85 | key=f"upload_{key}", 86 | ) 87 | 88 | 89 | def get_password(key: Literal["main", "merge"]) -> Optional[str]: 90 | password = st.text_input( 91 | "PDF Password", 92 | type="password", 93 | placeholder="Required if PDF is protected", 94 | key=f"password_{key}", 95 | ) 96 | return password if password != "" else None 97 | 98 | 99 | def upload_pdf( 100 | key: Literal["main", "merge"], password: Optional[str] 101 | ) -> Optional[Tuple[bytes, PdfReader]]: 102 | if file := st.file_uploader( 103 | label="Upload a PDF", 104 | type=["pdf"], 105 | key=f"file_{key}", 106 | ): 107 | session_state["file"] = file 108 | session_state["name"] = file.name 109 | pdf = file.getvalue() 110 | try: 111 | reader = PdfReader(BytesIO(pdf), password=password) 112 | except PdfReadError: 113 | reader = PdfReader(BytesIO(pdf)) 114 | return pdf, reader 115 | return None, None 116 | 117 | 118 | def load_pdf_from_url( 119 | key: Literal["main", "merge"], password: Optional[str] 120 | ) -> Optional[Tuple[bytes, PdfReader]]: 121 | url = st.text_input( 122 | "PDF URL", 123 | key=f"url_{key}", 124 | value="https://getsamplefiles.com/download/pdf/sample-1.pdf", 125 | ) 126 | 127 | @st.cache_data 128 | def _cached_get_url(url: str) -> requests.Response: 129 | return requests.get(url) 130 | 131 | if url != "": 132 | try: 133 | response = _cached_get_url(url) 134 | session_state["file"] = pdf = response.content 135 | session_state["name"] = url.split("/")[-1] 136 | try: 137 | 
reader = PdfReader(BytesIO(pdf), password=password) 138 | except PdfReadError: 139 | reader = PdfReader(BytesIO(pdf)) 140 | return pdf, reader 141 | except PdfStreamError: 142 | st.error("The URL does not seem to be a valid PDF file.", icon="❌") 143 | return None, None 144 | 145 | 146 | def load_pdf( 147 | key: Literal["main", "merge"] = "main", 148 | ) -> Optional[Tuple[bytes, PdfReader, str, bool]]: 149 | option = get_option(key) 150 | password = get_password(key) 151 | 152 | # Map options to functions 153 | option_functions: Dict[str, Callable[[str, str], Tuple[bytes, PdfReader]]] = { 154 | "Upload a PDF ⬆️": upload_pdf, 155 | "Load PDF from a URL 🌐": load_pdf_from_url, 156 | } 157 | 158 | if function := option_functions.get(option): 159 | pdf, reader = function(key, password) 160 | 161 | if pdf: 162 | preview_pdf( 163 | reader, 164 | pdf, 165 | key, 166 | password, 167 | ) 168 | return pdf, reader, password, reader.is_encrypted 169 | 170 | return None, None, "", False 171 | 172 | 173 | def handle_encrypted_pdf(reader: PdfReader, password: str, key: str) -> None: 174 | if password: 175 | session_state["decrypted_filename"] = f"unprotected_{session_state['name']}" 176 | decrypt_pdf( 177 | reader, 178 | password, 179 | filename=session_state["decrypted_filename"], 180 | ) 181 | pdf_viewer( 182 | f"unprotected_{session_state['name']}", 183 | height=600 if key == "main" else 250, 184 | key=str(random()), 185 | ) 186 | else: 187 | st.error("Password required", icon="🔒") 188 | 189 | 190 | def handle_unencrypted_pdf(pdf: bytes, key: str) -> None: 191 | pdf_viewer( 192 | pdf, 193 | height=600 if key == "main" else 250, 194 | key=str(random()), 195 | ) 196 | 197 | 198 | def display_metadata(reader: PdfReader) -> None: 199 | metadata = {"Number of pages": len(reader.pages)} 200 | for k in reader.metadata: 201 | value = reader.metadata[k] 202 | if is_pdf_datetime(value): 203 | value = convert_pdf_datetime(value) 204 | 205 | metadata[k.replace("/", "")] = value 206 | 207 | 
metadata = pd.DataFrame.from_dict(metadata, orient="index", columns=["Value"]) 208 | metadata.index.name = "Metadata" 209 | 210 | st.dataframe(metadata) 211 | 212 | 213 | def preview_pdf( 214 | reader: PdfReader, 215 | pdf: bytes = None, 216 | key: Literal["main", "other"] = "main", 217 | password: str = "", 218 | ) -> None: 219 | with contextlib.suppress(NameError): 220 | if key == "main": 221 | lcol, rcol = st.columns([2, 1]) 222 | with lcol.expander("📄 **Preview**", expanded=bool(pdf)): 223 | if reader.is_encrypted: 224 | handle_encrypted_pdf(reader, password, key) 225 | else: 226 | handle_unencrypted_pdf(pdf, key) 227 | 228 | with rcol.expander("🗄️ **Metadata**"): 229 | display_metadata(reader) 230 | elif reader.is_encrypted: 231 | handle_encrypted_pdf(reader, password, key) 232 | else: 233 | handle_unencrypted_pdf(pdf, key) 234 | 235 | 236 | @st.cache_data 237 | def is_pdf_datetime(s: str) -> bool: 238 | pattern = r"^D:\d{14}\+\d{2}\'\d{2}\'$" 239 | return bool(re.match(pattern, s)) 240 | 241 | 242 | @st.cache_data 243 | def convert_pdf_datetime(pdf_datetime: str) -> str: 244 | # Remove the 'D:' at the beginning 245 | pdf_datetime = pdf_datetime[2:] 246 | 247 | # Extract the date, time, and timezone components 248 | date_str = pdf_datetime[:8] 249 | time_str = pdf_datetime[8:14] 250 | tz_str = pdf_datetime[14:] 251 | 252 | return ( 253 | datetime.strptime(date_str + time_str, "%Y%m%d%H%M%S").strftime( 254 | "%Y-%m-%d %H:%M:%S " 255 | ) 256 | + tz_str 257 | ) 258 | 259 | 260 | @st.cache_data 261 | def parse_page_numbers(page_numbers_str): 262 | # Split the input string by comma or hyphen 263 | parts = page_numbers_str.split(",") 264 | 265 | # Initialize an empty list to store parsed page numbers 266 | parsed_page_numbers = [] 267 | 268 | # Iterate over each part 269 | for part in parts: 270 | # Remove any leading/trailing spaces 271 | part = part.strip() 272 | 273 | # If the part contains a hyphen, it represents a range 274 | if "-" in part: 275 | start, end = 
map(int, part.split("-")) 276 | parsed_page_numbers.extend(range(start, end + 1)) 277 | else: 278 | # Otherwise, it's a single page number 279 | parsed_page_numbers.append(int(part)) 280 | 281 | return [i - 1 for i in parsed_page_numbers] 282 | 283 | 284 | def extract_text( 285 | reader: PdfReader.pages, 286 | page_numbers_str: str = "all", 287 | mode: Literal["plain", "layout"] = "plain", 288 | ) -> str: 289 | text = "" 290 | 291 | if page_numbers_str == "all": 292 | for page in reader.pages: 293 | text = text + " " + page.extract_text(extraction_mode=mode) 294 | else: 295 | pages = parse_page_numbers(page_numbers_str) 296 | for page in pages: 297 | text = text + " " + reader.pages[page].extract_text() 298 | 299 | return text 300 | 301 | 302 | def extract_images(reader: PdfReader.pages, page_numbers_str: str = "all") -> str: 303 | images = {} 304 | if page_numbers_str == "all": 305 | for page in reader.pages: 306 | images |= {image.data: image.name for image in page.images} 307 | 308 | else: 309 | pages = parse_page_numbers(page_numbers_str) 310 | for page in pages: 311 | images.update( 312 | {image.data: image.name for image in reader.pages[page].images} 313 | ) 314 | 315 | return images 316 | 317 | 318 | def extract_tables(file, page_numbers_str): 319 | st.caption( 320 | "Adjust vertical and horizontal strategies for better extraction. Read details about the strategies [here](https://github.com/jsvine/pdfplumber?tab=readme-ov-file#table-extraction-strategies)." 
321 | ) 322 | col0, col1 = st.columns(2) 323 | vertical_strategy = col0.selectbox( 324 | "Vertical strategy", 325 | ["lines", "lines_strict", "text"], 326 | index=2, 327 | ) 328 | horizontal_strategy = col1.selectbox( 329 | "Horizontal strategy", 330 | ["lines", "lines_strict", "text"], 331 | index=2, 332 | ) 333 | 334 | header = st.checkbox("Header") 335 | 336 | first_row_index = 1 if header else 0 337 | 338 | with pdfplumber.open( 339 | BytesIO(file) if isinstance(file, bytes) else file, 340 | password=session_state["password"], 341 | ) as table_pdf: 342 | if page_numbers_str == "all": 343 | for page in table_pdf.pages: 344 | for table in page.extract_tables( 345 | { 346 | "vertical_strategy": vertical_strategy, 347 | "horizontal_strategy": horizontal_strategy, 348 | } 349 | ): 350 | st.write( 351 | pd.DataFrame( 352 | table[first_row_index:], 353 | columns=table[0] if header else None, 354 | ) 355 | ) 356 | else: 357 | pages = parse_page_numbers(page_numbers_str) 358 | for page in pages: 359 | for page in table_pdf.pages[page : page + 1]: 360 | for table in page.extract_tables( 361 | { 362 | "vertical_strategy": vertical_strategy, 363 | "horizontal_strategy": horizontal_strategy, 364 | } 365 | ): 366 | st.write( 367 | pd.DataFrame( 368 | table[first_row_index:], 369 | columns=table[0] if header else None, 370 | ) 371 | ) 372 | 373 | 374 | def decrypt_pdf(reader: PdfReader, password: str, filename: str) -> None: 375 | reader.decrypt(password) 376 | 377 | writer = PdfWriter() 378 | 379 | for page in reader.pages: 380 | writer.add_page(page) 381 | 382 | with open(filename, "wb") as f: 383 | writer.write(f) 384 | 385 | 386 | @st.cache_data 387 | def remove_images(pdf: bytes, remove_images: bool, password: str) -> bytes: 388 | reader = PdfReader(BytesIO(pdf)) 389 | 390 | if reader.is_encrypted: 391 | reader.decrypt(password) 392 | 393 | writer = PdfWriter() 394 | 395 | for page in reader.pages: 396 | writer.add_page(page) 397 | 398 | 
writer.add_metadata(reader.metadata) 399 | 400 | if remove_images: 401 | writer.remove_images() 402 | 403 | bytes_stream = BytesIO() 404 | writer.write(bytes_stream) 405 | 406 | bytes_stream.seek(0) 407 | 408 | return bytes_stream.getvalue() 409 | 410 | 411 | def reduce_image_quality(pdf: bytes, quality: int, password: str) -> bytes: 412 | reader = PdfReader(BytesIO(pdf)) 413 | 414 | if reader.is_encrypted: 415 | reader.decrypt(password) 416 | 417 | writer = PdfWriter() 418 | 419 | for page in reader.pages: 420 | writer.add_page(page) 421 | 422 | writer.add_metadata(reader.metadata) 423 | 424 | for page in writer.pages: 425 | for img in page.images: 426 | img.replace(img.image, quality=quality) 427 | 428 | bytes_stream = BytesIO() 429 | writer.write(bytes_stream) 430 | 431 | bytes_stream.seek(0) 432 | 433 | return bytes_stream.getvalue() 434 | 435 | 436 | @st.cache_data 437 | def compress_pdf(pdf: bytes, password: str) -> bytes: 438 | reader = PdfReader(BytesIO(pdf)) 439 | 440 | if reader.is_encrypted: 441 | reader.decrypt(password) 442 | 443 | writer = PdfWriter(clone_from=reader) 444 | 445 | for page in writer.pages: 446 | page.compress_content_streams() # This is CPU intensive! 447 | 448 | bytes_stream = BytesIO() 449 | writer.write(bytes_stream) 450 | bytes_stream.seek(0) 451 | 452 | return bytes_stream.getvalue() 453 | 454 | 455 | @st.cache_data 456 | def convert_pdf_to_word(pdf): 457 | cv = Converter(stream=pdf, password=session_state.password) 458 | docx_stream = BytesIO() 459 | cv.convert(docx_stream, start=0, end=None) 460 | cv.close() 461 | 462 | docx_stream.seek(0) 463 | return docx_stream 464 | 465 | 466 | def hex_to_rgb(hex_color: str) -> Tuple[float, float, float]: 467 | """ 468 | Convert a hexadecimal color code to an RGB color tuple. 469 | 470 | Args: 471 | hex_color (str): The hexadecimal color code. 
def hex_to_rgb(hex_color: str) -> Tuple[float, float, float]:
    """
    Convert a hexadecimal color code to an RGB color tuple.

    Both the six-digit form ("#RRGGBB") and the three-digit shorthand
    ("#RGB") are accepted, with or without the leading "#". Digits after the
    sixth are ignored.

    Args:
        hex_color (str): The hexadecimal color code.

    Returns:
        Tuple[float, float, float]: The RGB color tuple, each channel in the
        range 0.0 to 1.0.

    Raises:
        ValueError: If the code is too short or contains non-hexadecimal digits.
    """
    digits = hex_color.strip().lstrip("#")
    if len(digits) == 3:
        # Shorthand: each digit stands for a doubled pair ("f80" -> "ff8800").
        digits = "".join(digit * 2 for digit in digits)
    if len(digits) < 6:
        raise ValueError(f"Invalid hexadecimal color code: {hex_color!r}")

    red, green, blue = (int(digits[i : i + 2], 16) / 255 for i in (0, 2, 4))
    return red, green, blue


def draw_watermark_grid(
    can,
    stamp_label: str,
    step_x: int,
    step_y: int,
    width: float,
    height: float,
    *,
    margin: int = 100,
    angle: float = 45,
) -> None:
    """
    Draw a grid of rotated watermarks on the given canvas.

    Args:
        can (canvas.Canvas): The canvas to draw the watermarks on.
        stamp_label (str): The label to be displayed as the watermark.
        step_x (int): The horizontal spacing between watermarks.
        step_y (int): The vertical spacing between watermarks.
        width (float): The width of the canvas.
        height (float): The height of the canvas.
        margin (int): How far the grid extends beyond each edge of the canvas.
        angle (float): The rotation applied to each watermark, in degrees.

    Returns:
        None

    Raises:
        ValueError: If either spacing is not a positive number.
    """
    if step_x <= 0 or step_y <= 0:
        raise ValueError("step_x and step_y must be positive")

    # The row positions are identical for every column, so build them once.
    rows = range(-margin, int(height) + margin, step_y)
    for x in range(-margin, int(width) + margin, step_x):
        for y in rows:
            # Isolate each stamp so its translation and rotation do not
            # accumulate into the next one.
            can.saveState()
            can.translate(x, y)
            can.rotate(angle)
            can.drawCentredString(0, 0, stamp_label)
            can.restoreState()
def merge_watermark_into_pdf(pdf: bytes, watermark: BytesIO) -> bytes:
    """
    Stamp the first page of a watermark document onto every page of a PDF.

    Args:
        pdf (bytes): The PDF document to be stamped.
        watermark (BytesIO): A PDF whose first page is used as the stamp.

    Returns:
        bytes: The stamped PDF document.
    """
    output = PdfWriter()
    source = PdfReader(BytesIO(pdf))
    stamp_page = PdfReader(watermark).pages[0]

    for source_page in source.pages:
        # The same stamp page is merged into each source page in turn.
        source_page.merge_page(stamp_page)
        output.add_page(source_page)

    buffer = BytesIO()
    output.write(buffer)
    return buffer.getvalue()


def create_watermark_canvas(
    stamp_label: str, stamp_size: int, stamp_color: str, stamp_transparency: float
) -> BytesIO:
    """
    Render a letter-sized, single-page PDF tiled with the watermark label.

    Args:
        stamp_label (str): The text repeated across the page.
        stamp_size (int): The font size of the text.
        stamp_color (str): The fill color of the text in hexadecimal format.
        stamp_transparency (float): The value passed as the fill alpha.

    Returns:
        BytesIO: The rendered page, rewound to its start.
    """
    buffer = BytesIO()
    sheet = canvas.Canvas(buffer, pagesize=letter)

    sheet.setFont("Helvetica", stamp_size)
    sheet.setFillColorRGB(*hex_to_rgb(stamp_color))
    sheet.setFillAlpha(stamp_transparency)

    # NOTE(review): this saveState() has no matching restoreState() in this
    # function - confirm whether it is intentional.
    sheet.saveState()

    page_width, page_height = letter
    draw_watermark_grid(
        sheet,
        stamp_label,
        step_x=150,
        step_y=100,
        width=page_width,
        height=page_height,
    )

    sheet.save()
    buffer.seek(0)
    return buffer


@st.cache_data
def watermark_pdf(
    pdf: bytes,
    stamp_label: str,
    stamp_size: int,
    stamp_color: str,
    stamp_transparency: float,
) -> bytes:
    """
    Watermark every page of a PDF with a repeated text label.

    Args:
        pdf (bytes): The PDF document to be stamped.
        stamp_label (str): The text repeated across each page.
        stamp_size (int): The font size of the text.
        stamp_color (str): The fill color of the text in hexadecimal format.
        stamp_transparency (float): The value passed as the fill alpha.

    Returns:
        bytes: The stamped PDF document.
    """
    return merge_watermark_into_pdf(
        pdf,
        create_watermark_canvas(
            stamp_label, stamp_size, stamp_color, stamp_transparency
        ),
    )
def init():
    """Give each session-state entry used by the app a value, keeping any value already set."""
    defaults = {
        "decrypted_filename": None,
        "password": "",
        "is_encrypted": False,
    }
    for key, fallback in defaults.items():
        # An existing entry is written back unchanged; a missing one gets its default.
        session_state[key] = session_state[key] if key in session_state else fallback
sidebar_html = sidebar_file.read().replace("{VERSION}", __version__) 23 | 24 | st.components.v1.html(sidebar_html, height=290) 25 | 26 | st.html( 27 | """ 28 |
29 |
30 | Share the ❤️ on social media 31 |
""" 32 | ) 33 | 34 | social_media_links = [ 35 | "https://www.facebook.com/sharer/sharer.php?kid_directed_site=0&sdk=joey&u=https%3A%2F%2Fpdfworkdesk.streamlit.app%2F&display=popup&ref=plugin&src=share_button", 36 | "https://www.linkedin.com/sharing/share-offsite/?url=https%3A%2F%2Fpdfworkdesk.streamlit.app%2F", 37 | "https://x.com/intent/tweet?original_referer=https%3A%2F%2Fpdfworkdesk.streamlit.app%2F&ref_src=twsrc%5Etfw%7Ctwcamp%5Ebuttonembed%7Ctwterm%5Eshare%7Ctwgr%5E&text=Check%20out%20this%20open-source%20PDF-editing%20Streamlit%20app%21&url=https%3A%2F%2Fpdfworkdesk.streamlit.app%2F", 38 | ] 39 | 40 | social_media_icons = SocialMediaIcons( 41 | social_media_links, colors=["lightgray"] * len(social_media_links) 42 | ) 43 | 44 | social_media_icons.render(sidebar=True) 45 | 46 | st.html( 47 | """ 48 |
49 |
50 | 51 | Creative Commons License 53 |

54 | This work is licensed under a Creative Commons 55 | Attribution-NonCommercial-ShareAlike 4.0 International License.
56 | You can modify and build upon this work non-commercially. All derivatives should be 57 | credited to Siddhant Sadangi and 58 | be licenced under the same terms. 59 |
60 | """ 61 | ) 62 | 63 | 64 | if __name__ == "__main__": 65 | render() 66 | -------------------------------------------------------------------------------- /utils/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.7.1" 2 | --------------------------------------------------------------------------------