├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── pdfparser ├── __init__.py ├── __main__.py ├── custom_error.py ├── main_operations.py ├── user_choice.py ├── user_inputs.py └── utilities.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | bin/ 28 | include/ 29 | 30 | 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # pyenv 80 | .python-version 81 | 82 | # celery beat schedule file 83 | celerybeat-schedule 84 | 85 | # SageMath parsed files 86 | *.sage.py 87 | 88 | # Environments 89 | .env 90 | .venv 91 | env/ 92 | venv/ 93 | ENV/ 94 | env.bak/ 95 | venv.bak/ 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | .spyproject 100 | 101 | # Rope project settings 102 | .ropeproject 103 | 104 | # mkdocs documentation 105 | 
/site 106 | 107 | # mypy 108 | .mypy_cache/ 109 | 110 | ##my files 111 | check.py 112 | *.pdf 113 | deletes/ 114 | sorted/ 115 | splits/ 116 | merged/ 117 | pyvenv.cfg 118 | .DS_Store 119 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Udit Vashisht 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uditvashisht/sg-pdfparser/4c1f55e221212d54aa4a29e4a5334557f6dc66ac/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SaralGyaan PDF Parser 2 | SaralGyaan PDF Parser is a command-line PDF parsing tool which allows you to:- 3 | 1. Delete Pages from a PDF 4 | 2. Merge PDFs 5 | 3. Sort Pages of a PDF 6 | 4. Split a PDF 7 | ## Installation 8 | You can install SaralGyaan PDF Parser from [PyPI](https://pypi.org/project/sg-pdfparser/): 9 | ```pip install sg-pdfparser``` 10 | The PDF Parser supports Python 3.6 and above. 11 | ## How to use? 12 | The SaralGyaan PDF Parser is a command-line application named pdfparser. To start it, open the terminal, go to the folder containing the PDF file(s) to be parsed and call the program:- 13 | ``` 14 | $ pdfparser 15 | Welcome to PDF Parser 16 | 17 | What do you want to do? 18 | 19 | 1. Delete Pages from a PDF 20 | 2. Merge PDFs 21 | 3. Sort Pages of a PDF 22 | 4. Split a PDF 23 | 24 | Press ctrl + C to exit. 25 | Enter your choice (1-4): 26 | ``` 27 | ## Delete Pages from a PDF 28 | If you select the option to delete pages, it will ask for the file name, including the '.pdf' extension. 29 | ``` 30 | Enter your choice (1-4): 1 31 | Delete Pages from a PDF 32 | Enter the name of file with extension(.pdf) 33 | ``` 34 | Make sure the PDF file exists at that location. Once you provide a valid PDF file, it will give you two options:- 35 | ``` 36 | What do you want to do? 37 | 1. Delete specific pages 38 | 2. Keep Specific pages 39 | ``` 40 | Both options accept comma-separated pages, page ranges, or both, e.g. 1, 2, 3-5 or 1-2 or 2-3, 4-6. 
41 | One thing you need to know is that if you use Keep specific pages and change the order, like 1-2, 6-4, then it will re-arrange the pages too. 42 | 43 | ## Merge PDFs 44 | This option accepts comma-separated file names and merges the files in the order in which they are provided. 45 | 46 | ## Sort pages of a PDF 47 | Sort pages gives you three options 48 | ``` 49 | What do you want to do? 50 | 1. Reverse order of all the pages 51 | 2. Swap Pages 52 | 3. Move certain pages to a specific index 53 | ``` 54 | 1. The first one will simply reverse the order of the pages. 55 | 2. The second one will swap two pages; you can input a single swap or multiple swaps, e.g. 1-3, 2-7, 8-9, etc. 56 | 3. The third one will move a page to a certain page number (not index). It also accepts comma-separated values. So 21-2 will move page number 21 to position 2, shifting the remaining pages to the right. 57 | 58 | ## Split a PDF 59 | This gives you two options:- 60 | ``` 61 | What do you want to do? 62 | 1. Split all the pages 63 | 2. Split specific pages 64 | ``` 65 | 1. The first option will make n split files for an n-page PDF file. 66 | 2. The second one will split the PDF into the pages or ranges provided, e.g. 1, 3, 9-22 will give three split files: the first page, the third page, and pages nine to twenty-two. 67 | 68 | ## Video Usage Guide 69 | You can also check out our [Usage Guide](https://youtu.be/BenY3DeEaf4) on YouTube. 70 | 71 | ## License 72 | 73 | © 2020 Udit Vashisht 74 | 75 | This repository is licensed under the MIT license. See LICENSE for details. 
# --------------------------------------------------------------------------------
# /pdfparser/__init__.py:
# --------------------------------------------------------------------------------
# Version of sg-pdfparser

__version__ = "1.0.2"

# --------------------------------------------------------------------------------
# /pdfparser/__main__.py:
# --------------------------------------------------------------------------------
"""SaralGyaan PDF Parser
Installation:
------------
$ pip install sg-pdfparser

Usage:
------
$ pdfparser

Available Options:
    1. Delete Pages from a PDF
    2. Merge PDFs
    3. Sort Pages of a PDF
    4. Split a PDF

Inputs:

    Top Level:

        Single PDF file with file extension (.pdf) for option 1, 3 & 4 and at least two PDF files (separated by comma) with file extension (.pdf) for option 2.

    Each Function:

        Delete Pages from a PDF:

            Page numbers or page ranges separated by comma or both e.g. 1, 3, 4 or 1-2, 4 or 1-7, 9-11

        Merge PDFs:

            At least two PDF files (separated by comma) with file extension(.pdf). The PDFs will be merged in the order of input.

        Sort Pages of PDF:

            Reverse the order will reverse the order of the pages.

            A hyphen separated combination of page numbers. Can add multiple separated by comma e.g. 1-9, 2-6

            Swap:- Will swap page number 1 & 9 and 2 & 6.
            Move:- Will move page number 1 to Page 9 and 2 to 6 moving the rest of the pages to the right.

        Split a PDF:

            Split all will split n-paged files into n split files one page in each.

            Page numbers or page ranges separated by comma or both e.g. 1, 3, 4 or 1-2, 4 or 1-7, 9-11

            1-2, 4 will make two PDF files.


Contact:
--------
- admin@saralgyaan.com

More information is available at:
- https://pypi.org/project/sg-pdfparser/
- https://github.com/uditvashisht/sg-pdfparser

Version:
---------
pdfparser v1.0.2
"""
# Inbuilt library imports
import os
import sys
# Import top level functions
from pdfparser.main_operations import pdf_delete, pdf_merge, pdf_sort, pdf_split
from pdfparser.user_choice import UserChoice
# Import colorama for color effects
from colorama import Fore, Back, Style, init
init(autoreset=True)


# Order matters: index i holds the handler for menu entry i + 1.
ALL_OPERATIONS = [pdf_delete, pdf_merge, pdf_sort, pdf_split]

welcome_choices = UserChoice(4)


def display_welcome_message():
    """ Prints the welcome screen with all the top level options of PDF Parsing

    The menu is shown again after every finished operation; the loop only
    ends when the user interrupts the program (ctrl + c) or closes stdin.

    Parameters:
        Input : int (1-4)
    """
    while True:
        try:
            print("\nWelcome to PDF Parser\n")
            print("What do you want to do?\n")
            print("""1. Delete Pages from a PDF\n2. Merge PDFs\n3. Sort Pages of a PDF\n4. Split a PDF\n""")
            print("Press ctrl + c to exit.")

            welcome_choices.run_block(ALL_OPERATIONS)
        # Keyboard interrupt, or end of input (ctrl + d / closed pipe), which
        # would otherwise end the program with a traceback.
        except (KeyboardInterrupt, EOFError):
            print(Fore.RED + "\nExiting...")
            sys.exit()


def main():
    """Console-script entry point: start the interactive menu."""
    display_welcome_message()


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /pdfparser/custom_error.py:
# --------------------------------------------------------------------------------
class Error(Exception):
    """Base class for other exceptions"""
    pass


class ChoiceNotInOptions(Error):
    """If the choice is not in option"""
    pass


class InvalidSelection(Error):
    """If the selected pages are invalid"""
    pass


class ChoiceNotInRange(Error):
    """If the page number is not in range"""
    pass


class EnterCombination(Error):
    """If user has entered only one value for swaps"""
    pass


class InputMultipleFilesError(Error):
    """ If user passes only one file in merge functions.
    """
    pass

# --------------------------------------------------------------------------------
# /pdfparser/main_operations.py:
# --------------------------------------------------------------------------------
from PyPDF2 import PdfFileReader, PdfFileWriter
from pdfparser.user_inputs import file_input
from pdfparser.user_choice import UserChoice
from pdfparser.custom_error import *
from pdfparser.utilities import *
import os
import re
from colorama import Fore, Back, Style, init
import datetime

# Regex Match for checking the user input for page ranges.
# One or more comma separated items, each item being "N" or "N-M".  The
# previous pattern also accepted inputs such as "1-", "1,,2" or "1-2-3",
# which later failed (or were misread) while being converted to integers.
INPUT_MATCH = r'^[0-9]+(-[0-9]+)?(,[0-9]+(-[0-9]+)?)*$'
CURRENT_DIRECTORY = os.getcwd()

timestamp = datetime.datetime.now().strftime("%d%m%Y_%H%M%S")


def _refresh_timestamp():
    """Re-stamp ``timestamp`` so every operation gets its own output names.

    The stamp used to be fixed at import time, so two operations of the same
    kind in one session silently overwrote each other's output file.
    """
    global timestamp
    timestamp = datetime.datetime.now().strftime("%d%m%Y_%H%M%S")


def _output_directory(name):
    """Return the output folder *name* under the working directory, creating it if needed."""
    to_directory = os.path.join(CURRENT_DIRECTORY, name)
    os.makedirs(to_directory, exist_ok=True)
    return to_directory


def _base_name(file_name):
    """Return *file_name* without directories, extension or spaces, for building output names."""
    return os.path.splitext(os.path.basename(file_name))[0].replace(" ", "")


def _run_operation(func, final_list, others=None):
    """Call the follow-up operation selected by ``user_range_input``.

    *func* may be a callable or, for backward compatibility, the name of a
    function defined in this module.  *others* is ``[message, flag]`` where
    *flag* is a boolean or the legacy string form ``"name = True"``.
    """
    operation = globals()[func] if isinstance(func, str) else func
    if others is None:
        return operation(final_list)
    message, flag = others[0], others[1]
    if isinstance(flag, str):
        # Legacy form: turn "deleting = True" into the keyword deleting=True.
        keyword, _, value = flag.partition('=')
        return operation(final_list, message, **{keyword.strip(): value.strip() == 'True'})
    return operation(final_list, message, flag)


def user_range_input(message, number_of_pages, func, final_list, others=None, sorts=False):
    """ This function takes the user's range input and then runs the next function

    The prompt is repeated until the input is well formed and every page
    number lies within the document.

    Parameters:
    -----------
    message: string
        Adds necessary message in Input Prompt Text
    number_of_pages : int
        No. of pages in the PDF file.
    func : function or str
        Next function to be run (a callable, or the name of a function of this module)
    final_list: list
        Ignored on input; it is rebuilt from what the user types
    others: list
        ``[message, flag]`` forwarded to func; selects the kind of operation,
        e.g. deleting or keeping, swapping or moving
    sorts : Boolean
        If True only hyphen separated combinations (1-3) are accepted

    Runs:
        Function func with given parameters
    """

    while True:
        try:
            if sorts:
                user_input = input(f'Please enter combination of pages (separated by hyphen[-]) [{message}]\n')
            else:
                user_input = input(f'Please enter the page numbers or range [{message}]\n')
            # Remove extra white spaces and any comma at the beginning or end
            user_input = str(user_input).replace(' ', '').strip(',')
            # Run RE match to check input
            if not re.match(INPUT_MATCH, user_input):
                raise InvalidSelection
            # Create final list on the basis of user input
            final_list = process_selections(user_input, sorts=sorts)
            # Check if pages are in range
            check_pages_in_range(final_list, number_of_pages)
        except InvalidSelection:
            print(Fore.RED + "\nInvalid Selection- Allowed options are 1,2,3 or 1-2, 3 or 1, or 1-3\n")
        except ChoiceNotInRange:
            print(Fore.RED + f'\nInvalid Page Number- Choose from pages 1 to {number_of_pages}\n')
        except EnterCombination:
            print(Fore.RED + f'\nEnter the combination of page numbers separated by hyphen(-)\n')
        else:
            # The operation is called directly rather than by exec()-ing a
            # string of source code assembled from the arguments.
            _run_operation(func, final_list, others)
            break


def write_pdf(page_number, pdf, to_directory, file_name):
    """This function writes a single page to the pdf

    Parameters:
    -----------
    page_number: int
        Single page number (1-based) to be written
    pdf : PyPDFs PDF element
    to_directory: str
        The directory where the output file will be saved.
    file_name : str
        Name of the file being processed

    Returns:
    --------
    None; a PDF file is written to to_directory.
    """
    pdf_writer = PdfFileWriter()
    # PyPDF2 pages are 0-based, user facing page numbers are 1-based.
    pdf_writer.addPage(pdf.getPage(page_number - 1))
    output = os.path.join(to_directory, f'{_base_name(file_name)}_{page_number}_{timestamp}.pdf')
    with open(output, 'wb') as output_pdf:
        pdf_writer.write(output_pdf)


def write_multiple_pages_pdf(list_of_pages, pdf, to_directory, file_name):
    """This function writes a range of pages to pdf file like 1-5 (1 to 5) and 3-1 (3 to 1)

    Parameters:
    -----------
    list_of_pages: list
        The two ends of the range, e.g. [1, 5]; a descending range such as
        [5, 1] writes the pages in reverse order
    pdf : PyPDFs PDF element
    to_directory: str
        The directory where the output file will be saved.
    file_name : str
        Name of the file being processed

    Returns:
    --------
    None; a PDF file is written to to_directory.
    """
    first, last = list_of_pages[0], list_of_pages[-1]
    if first > last:
        # if list is [5, 1]
        page_range = range(first, last - 1, -1)
    else:
        # if list is [1, 5]
        page_range = range(first, last + 1)

    pdf_writer = PdfFileWriter()
    pages_label = "_".join(str(i) for i in list_of_pages)
    output = os.path.join(to_directory, f'{_base_name(file_name)}_{pages_label}_{timestamp}.pdf')
    for item in page_range:
        pdf_writer.addPage(pdf.getPage(item - 1))
    with open(output, 'wb') as output_pdf:
        pdf_writer.write(output_pdf)


def do_the_splits(final_list):
    """This function creates the split pdf files from the final list of input

    Reads the file chosen in pdf_split through the module globals
    file_to_be_split / single_pdf_element_split.

    Parameters
    ----------
    final_list : list
        A processed and valid list containing page numbers or ranges or both.

    Returns:
        None; one split PDF file is written per entry of final_list.
    """

    print("Splitting...\n")
    to_directory = _output_directory('splits/')

    for item in final_list:
        if isinstance(item, list):
            write_multiple_pages_pdf(item, single_pdf_element_split, to_directory, file_to_be_split)
        else:
            write_pdf(item, single_pdf_element_split, to_directory, file_to_be_split)
    print(Fore.GREEN + f'{len(final_list)} split files saved in directory {to_directory}')


def write_pdf_for_pages(list_of_pages, pdf, output):
    """This write pdf for list of pages [1, 3, 8] unlike write_multiple_pages_pdf which takes range.

    Parameters:
    -----------
    list_of_pages: list
        A list containing 1-based pages e.g. [1, 3, 8], written in this order
    pdf : PyPDFs PDF element
    output: str
        Path of the output file

    Returns:
    --------
    None; a PDF file is written to output.
    """
    pdf_writer = PdfFileWriter()
    for item in list_of_pages:
        pdf_writer.addPage(pdf.getPage(item - 1))
    with open(output, 'wb') as output_pdf:
        pdf_writer.write(output_pdf)


def delete_pages(list_of_pages, file, to_directory, deleting):
    """ This function deletes/keeps pages of pdf and creates the new pdf

    Reads the document chosen in pdf_delete through the module globals
    number_of_pages_delete / single_pdf_element_delete.

    Parameters:
    ----------
    list_of_pages : list
        list of pages to be kept or deleted (non-empty)
    file : str
        Name of the PDF file
    to_directory: str
        Directory where the new pdf file will be saved
    deleting : boolean
        If True, it deletes
        else keeps
    Returns:
    --------
    None; a PDF file is written to to_directory.
    """

    if deleting:
        # Set lookup keeps the filter linear in the number of pages.
        unwanted = set(list_of_pages)
        keep_pages = [page for page in range(1, number_of_pages_delete + 1) if page not in unwanted]
    else:
        keep_pages = list_of_pages
    output = os.path.join(
        to_directory,
        f'{_base_name(file)}_deleted_{list_of_pages[0]}_{list_of_pages[-1]}_{timestamp}.pdf',
    )
    write_pdf_for_pages(keep_pages, single_pdf_element_delete, output)


def do_the_deletes(final_list, msg, deleting):
    """This function processes the final_list (nested), flattens it and runs delete_pages function to create the final output pdf

    Parameters:
    ----------
    final_list : list
        List input by the user (can be nested)
    msg : str
        "Deleting" or "Keeping", used in the progress messages
    deleting : boolean
        if true deleting
        else keeping
    Runs:
        delete_pages
    """
    print(f'{msg} specific pages...\n')
    to_directory = _output_directory('deletes/')
    list_of_user_choices = []
    for item in final_list:
        if isinstance(item, list):
            # A range may be typed in either direction (2-5 or 5-2).
            low, high = sorted((item[0], item[-1]))
            list_of_user_choices.extend(range(low, high + 1))
        else:
            list_of_user_choices.append(item)
    # Duplicates are dropped and the pages are put in ascending order.
    list_of_user_choices = sorted(set(list_of_user_choices))
    delete_pages(list_of_user_choices, file_to_be_deleted, to_directory, deleting)
    print(Fore.GREEN + f'File after {msg.lower()} the pages have been saved in {to_directory}')


def split_options(all=True):
    """
    This function runs the split chosen by the user

    Parameters:
    ----------
    all : boolean
        If true it splits all otherwise ask for user_input

    Returns:
        None; if all = True, n single-page files are written for an n paged pdf

    Runs:
        If all = False, runs user_range_input to take user's input

    """
    to_directory = _output_directory('splits/')
    if all:
        for page in range(1, number_of_pages_split + 1):
            write_pdf(page, single_pdf_element_split, to_directory, file_to_be_split)
        print(Fore.GREEN + f'{number_of_pages_split} split files saved in directory {to_directory}')
    else:
        print("Enter the pages number(s) separated by comma or range of pages (1-2) or both\n")
        user_range_input("To be Split", number_of_pages_split, func=do_the_splits, final_list=None)


def delete_options(deleting=True):
    """ This function gives all the delete options to the user.

    Parameters:
    ----------
    deleting: boolean
        If true, deletes the pages
        else keeps the pages

    Runs :
        user_range_input to take the user input

    """
    if deleting:
        msg = "To be Deleted"
        msg_2 = "Deleting"
    else:
        msg = "To be Kept"
        msg_2 = "Keeping"
    others = [msg_2, bool(deleting)]

    print(f'\nEnter the pages number(s)[{msg}] separated by comma or range of pages(1-2) or both\n')

    user_range_input(msg, number_of_pages_delete, func=do_the_deletes, others=others, final_list=None)


def swap_move(final_list, msg, swap):
    """ This function performs the sort options swap and move

    Combinations are applied one after another, each on the result of the
    previous one, so every page appears exactly once in the output even when
    combinations share a page (e.g. 1-2, 2-3).

    Parameters:
        final_list : list
            Processed input list of [page, page] combinations
        msg : str
            Message to be added in print statements
        swap : boolean
            If True, swaps
            else, moves

    Returns:
        None; the swapped or moved PDF file is written to the 'sorted' folder.
    """
    to_directory = _output_directory('sorted/')
    temp_list = list(range(1, number_of_pages_sort + 1))
    if swap:
        for item in final_list:
            first, second = item[0] - 1, item[-1] - 1
            temp_list[first], temp_list[second] = temp_list[second], temp_list[first]
        message = "swapped"
    else:
        for item in final_list:
            page, target = item[0], item[1]
            # Take the page out wherever it currently is, then put it at the
            # requested (1-based) position; later pages shift to the right.
            temp_list.remove(page)
            temp_list.insert(target - 1, page)
        message = "moved"
    output = os.path.join(to_directory, f'{_base_name(file_to_be_sorted)}_{message}_{timestamp}.pdf')
    write_pdf_for_pages(temp_list, single_pdf_element_sort, output)
    print(Fore.GREEN + f'File with {message} pages have been saved in {to_directory}')


def swap_options(reverse=False, swap=False, move=False):
    """ This function gives all the sort options to the user.

    Parameters:
    ----------
    reverse: boolean
        If true, it reverses the orders of the pages
    swap: boolean
        If true, it swaps the pages
    move: boolean
        If true, it moves the page to desired page number.  Moving is also
        what happens when neither reverse nor swap is set.

    Runs :
        user_range_input to take the user input (swap and move only)

    """
    to_directory = _output_directory('sorted/')
    if reverse:
        print("Reversing the order of pages...\n")
        output = os.path.join(to_directory, f'{_base_name(file_to_be_sorted)}_reversed_{timestamp}.pdf')
        list_of_pages = list(range(number_of_pages_sort, 0, -1))
        write_pdf_for_pages(list_of_pages, single_pdf_element_sort, output)
        print(Fore.GREEN + f'File with pages in the reversed order has been saved in {to_directory}')
        return

    if swap:
        msg_1 = "swap"
        msg_2 = "To be swapped"
    else:
        msg_1 = "move"
        msg_2 = "To be moved"
    others = [msg_2, bool(swap)]
    print(f'Enter the pages number(s) separated by - to {msg_1} their positions. For mulitple {msg_1}s, separate them with comma\n')
    user_range_input(msg_2, number_of_pages_sort, func=swap_move, others=others, final_list=None, sorts=True)


# Create class UserChoice for each top level function
split_choices = UserChoice(2)
delete_choices = UserChoice(2)
sort_choices = UserChoice(3)
# create a list of all second level functions
split_functions = [split_options]
delete_functions = [delete_options]
sort_functions = [swap_options]


def pdf_split():
    """ This function gives the option of all the split functions to the user

    Stores the chosen file, its reader and its page count in module globals
    that the split helpers read.

    Runs
        split_choices
    """
    global file_to_be_split
    global single_pdf_element_split
    global number_of_pages_split
    _refresh_timestamp()
    print("\nSplit a PDF\n")
    file_to_be_split = file_input()
    single_pdf_element_split = PdfFileReader(file_to_be_split)
    number_of_pages_split = single_pdf_element_split.getNumPages()
    print(Fore.GREEN + f'\nYour file {file_to_be_split} has {number_of_pages_split} page(s).\n')
    print("""What do you want to do?\n\n 1. Split all the pages\n 2. Split specific pages\n""")
    split_choices.run_block(split_functions, splits=True)


def pdf_delete():
    """ This function gives the option of all the delete functions to the user

    Stores the chosen file, its reader and its page count in module globals
    that the delete helpers read.

    Runs
        delete_choices
    """
    global file_to_be_deleted
    global single_pdf_element_delete
    global number_of_pages_delete
    _refresh_timestamp()
    print("\nDelete Pages from a PDF\n")
    file_to_be_deleted = file_input()
    single_pdf_element_delete = PdfFileReader(file_to_be_deleted)
    number_of_pages_delete = single_pdf_element_delete.getNumPages()
    print(f'\nYour file {file_to_be_deleted} has {number_of_pages_delete} page(s).\n')
    print("""What do you want to do?\n\n 1. Delete specific pages\n 2. Keep Specific pages\n""")
    delete_choices.run_block(delete_functions, deletes=True)


def pdf_merge():
    """
    This function asks for at least two PDF files and writes them, in the
    order given, into one merged file in the 'merged' folder.

    """
    _refresh_timestamp()
    print("\nMerge PDFs\n")
    print("PDFs will be merged in the same order as in input.")
    files = file_input(single_file=False)
    to_directory = _output_directory('merged/')
    pdf_writer = PdfFileWriter()
    merged_name = "_".join(_base_name(file) for file in files)
    output = os.path.join(to_directory, f'{merged_name}_{timestamp}.pdf')
    for file in files:
        pdf_reader = PdfFileReader(file)
        for page in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page))
    with open(output, 'wb') as output_file:
        pdf_writer.write(output_file)

    print(Fore.GREEN + f'The merged file has been saved in {to_directory}')


def pdf_sort():
    """ This function gives the option of all the sort functions to the user

    Stores the chosen file, its reader and its page count in module globals
    that the sort helpers read.

    Runs
        sort_choices
    """
    global file_to_be_sorted
    global single_pdf_element_sort
    global number_of_pages_sort
    _refresh_timestamp()
    print("Sort Pages of a PDF\n")
    file_to_be_sorted = file_input()
    single_pdf_element_sort = PdfFileReader(file_to_be_sorted)
    number_of_pages_sort = single_pdf_element_sort.getNumPages()
    print(f'\nYour file {file_to_be_sorted} has {number_of_pages_sort} page(s).\n')
    print("""What do you want to do?\n\n 1. Reverse order of all the pages\n 2. Swap Pages\n 3. Move certain pages to a specific index\n""")
    sort_choices.run_block(sort_functions, sorts=True)

# --------------------------------------------------------------------------------
# /pdfparser/user_choice.py:
# --------------------------------------------------------------------------------
from pdfparser.custom_error import *
from colorama import Fore, Back, Style, init
init(autoreset=True)


class UserChoice:
    """ This class creates a UserChoice (a numbered menu with
    number_of_choices entries) for each function
    """

    def __init__(self, number_of_choices):
        self.number_of_choices = number_of_choices

    def run_block(self, custom_functions, deletes=False, splits=False, sorts=False):
        """
        This one runs the prompt for taking user's input for each choice and then runs the necessary function.

        Parameters:
            custom_functions : list
                Functions of this menu.  For the delete, split and sort menus
                only the first entry is used and the choice is passed to it
                as a keyword flag; otherwise entry (choice - 1) is called.
            deletes, splits, sorts : boolean
                Select which second level menu is being served.
        """
        while True:
            # Only the parsing of the answer is guarded, so a ValueError
            # raised inside the chosen operation is not misreported as
            # "Not an Integer".
            try:
                user_choice = int(input(f'Enter your choice (1-{self.number_of_choices}): '))
                if user_choice not in range(1, self.number_of_choices + 1):
                    raise ChoiceNotInOptions
            except ValueError:
                print(Fore.RED + f'Not an Integer: Enter a digit from 1 to {self.number_of_choices}')
                continue
            except ChoiceNotInOptions:
                print(Fore.RED + f'Option not in choices: Enter a digit from 1 to {self.number_of_choices}')
                continue

            if deletes:
                if user_choice == 1:
                    custom_functions[0](deleting=True)
                elif user_choice == 2:
                    custom_functions[0](deleting=False)
            elif splits:
                if user_choice == 1:
                    custom_functions[0](all=True)
                elif user_choice == 2:
                    custom_functions[0](all=False)
            elif sorts:
                if user_choice == 1:
                    custom_functions[0](reverse=True)
                elif user_choice == 2:
                    custom_functions[0](swap=True)
                else:
                    custom_functions[0](move=True)
            else:
                custom_functions[user_choice - 1]()
            break

# --------------------------------------------------------------------------------
# /pdfparser/user_inputs.py:
# --------------------------------------------------------------------------------
from pdfparser.utilities import *
from pdfparser.custom_error import *
from colorama import Fore, Back, Style, init
init(autoreset=True)


def file_input(single_file=True):
    """
    This function takes the file input from the user

    The prompt is repeated until every named file can be opened and read as
    a PDF.

    Parameters:
        single_file : boolean
            if True, single input is allowed (used for delete, sort and split)
            else, multiple file inputs separated by comma ( used for merge )
    Returns:
        file : str
            In case of delete, sort and split
        list of files : list
            In case of merge
    """
    while True:
        try:
            if single_file:
                file_name = str(input('Enter the name of file with extension(.pdf)\n')).strip()
                # Raises IOError if the file is missing, ValueError if it is not a PDF
                file_check(file_name)
                return file_name
            file_names = str(input('Enter the names of files with extension(.pdf) separated by a comma\n'))
            # Empty names left by stray commas ("a.pdf,") are not counted as files.
            files = [item.strip() for item in file_names.split(',') if item.strip()]
            # If fewer than two inputs are given for merge, throw error.
            if len(files) < 2:
                raise InputMultipleFilesError
            for file in files:
                file_check(file)
            return files
        except ValueError:
            print(Fore.RED + "Please enter the correct filename")
        except IOError as e:
            # errno/strerror are read as attributes: e.args does not always
            # hold exactly two values.
            print(Fore.RED + f'IO Error-({e.errno}) : {e.strerror or e}')
        except InputMultipleFilesError:
            print(Fore.RED + "\nInput at least two PDF files.\n")

# --------------------------------------------------------------------------------
# /pdfparser/utilities.py:
# --------------------------------------------------------------------------------
from PyPDF2 import PdfFileReader
from pdfparser.custom_error import ChoiceNotInRange, EnterCombination


def flatten_list(input_list):
    """ Flattens a nested list

    Parameters:
    -----------
    input_list : list
        Nested list of user input
    Returns:
        flat_list : list
            New flattened list; input_list is not modified
    """
    # Local accumulator: the function used to append to a global created by
    # check_pages_in_range and failed when called on its own.
    flat_list = []
    for item in input_list:
        if isinstance(item, list):
            flat_list.extend(flatten_list(item))
        else:
            flat_list.append(item)
    return flat_list


def process_selections(selection_string, sorts=False):
    """
    This function processes the user input and converts that string into a meaningful list

    Parameters:
        selection_string: str
            Validated input without spaces, e.g. "1,3-5"
        sorts: boolean
            If true, single elements are not allowed; the list will only hold
            combinations of pages [[1,3],[2,5]]
    Returns:
        multiple_pages : list
            Used in sorting. A list in which hyphens are converted to comma [[1,3],[2,5]]
        selection_list_nested: list
            Used in delete and split. Single pages come first, followed by
            the ranges [1,[1,3],[2,5]]
    Raises:
        EnterCombination if sorts is True and a single page was entered.
    """
    temp_list = selection_string.split(',')
    single_pages = [int(i) for i in temp_list if '-' not in i]
    multiple_pages = [list(map(int, i.split('-'))) for i in temp_list if '-' in i]
    if sorts:
        if single_pages:
            raise EnterCombination
        return multiple_pages
    return single_pages + multiple_pages


def check_pages_in_range(selection_list_nested, total_pages):
    """ It checks that the page numbers entered by the user are in the range of total pages of pdf

    Parameters:
        selection_list_nested:list
            list returned by process_selections
        total_pages:int
            Total number of pages in the PDF to be parsed
    Raises:
        ChoiceNotInRange Error if the page number is not in range.
    """
    for item in flatten_list(selection_list_nested):
        if not 1 <= item <= total_pages:
            raise ChoiceNotInRange


def file_check(pdf_path):
    """ Checks whether the input file is a pdf file; makes use of PyPDF2 to throw the error if the file can't be read.

    Returns:
        pdf : PyPDF2's pdf element
    Raises:
        IOError if the file cannot be opened.
        ValueError if PyPDF2 cannot read the file as a PDF.
    """
    try:
        # The path is handed to PyPDF2 directly: the reader returned before
        # was bound to a file handle that had already been closed.
        return PdfFileReader(pdf_path)
    except OSError:
        raise
    except Exception as err:
        # PyPDF2's own read error lives in different modules depending on
        # the installed version, so it is translated to the ValueError that
        # file_input already reports to the user.
        raise ValueError(f'{pdf_path} is not a readable PDF file') from err

# --------------------------------------------------------------------------------
# /setup.py:
# --------------------------------------------------------------------------------
import pathlib
from setuptools import setup

# The directory containing this file
HERE = pathlib.Path(__file__).parent

# The text of the README file (it contains non-ASCII characters, so the
# encoding must not depend on the platform default)
README = (HERE / "README.md").read_text(encoding="utf-8")

# This call to setup() does all the work
setup(
    name="sg-pdfparser",
    version="1.0.2",
    description="A Command Line tool for parsing PDFs.",
    long_description=README,
    long_description_content_type="text/markdown",
    url="https://github.com/uditvashisht/sg-pdfparser",
    author="Udit Vashisht",
    author_email="admin@saralgyaan.com",
    license="MIT",
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
    packages=["pdfparser"],
    include_package_data=True,
    # The package uses f-strings, matching the README's "Python 3.6 and above".
    python_requires=">=3.6",
    install_requires=["pypdf2", "colorama"],
    entry_points={
        "console_scripts": [
            "pdfparser=pdfparser.__main__:main",
        ]
    },
)
# --------------------------------------------------------------------------------