├── Final ├── documents │ ├── pdf-sample.pdf │ └── docx-sample.docx ├── docx2text.py ├── pipeline.py ├── pdf2text.py ├── run.py └── output │ └── test-2018-04-08-163829.txt ├── LICENSE └── Readme.md /Final/documents/pdf-sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhimishra91/corpus-creator/master/Final/documents/pdf-sample.pdf -------------------------------------------------------------------------------- /Final/documents/docx-sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhimishra91/corpus-creator/master/Final/documents/docx-sample.docx -------------------------------------------------------------------------------- /Final/docx2text.py: -------------------------------------------------------------------------------- 1 | from docx import Document 2 | 3 | # Function to convert Docx to Text 4 | def parse_docx2txt(fname): 5 | text = str() 6 | doc = Document(fname) 7 | for p in doc.paragraphs: 8 | text = text + p.text 9 | return text -------------------------------------------------------------------------------- /Final/pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | from docx2text import parse_docx2txt 3 | from pdf2text import parse_pdf2txt 4 | 5 | 6 | # Function to split the files and send them to different parsers 7 | def pipeline(path): 8 | text_complete = str() 9 | text_out_pdf = str() 10 | text_out_doc = str() 11 | if path == "": path = os.getcwd() + "\\" #if no pdfDir passed in 12 | for file in os.listdir(path): # iterate through file in specified path 13 | file_extension = file.split(".")[-1] 14 | filename = path + file 15 | if file_extension == "pdf": 16 | text_out_pdf = parse_pdf2txt(filename) #get string of text content of pdf 17 | else: 18 | text_out_doc = parse_docx2txt(filename) 19 | text_complete = text_complete + text_out_pdf + text_out_doc 20 | return text_complete -------------------------------------------------------------------------------- /Final/pdf2text.py: -------------------------------------------------------------------------------- 1 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 2 | from pdfminer.converter import TextConverter 3 | from pdfminer.layout import LAParams 4 | from pdfminer.pdfpage import PDFPage 5 | from io import StringIO 6 | 7 | # Function to change the pdf file to text file 8 | def parse_pdf2txt(filename, pages=None): 9 | if not pages: 10 | page_number = set() 11 | else: 12 | page_number = set(pages) 13 | 14 | output = StringIO() 15 | manager = PDFResourceManager() 16 | converter = TextConverter(manager, output, laparams=LAParams()) 17 | interpreter = PDFPageInterpreter(manager, converter) 18 | 19 | infile = open(filename, 'rb') 20 | for page in PDFPage.get_pages(infile, page_number): 21 | interpreter.process_page(page) 22 | infile.close() 23 | converter.close() 24 | text = output.getvalue() 25 | #output.close 26 | return text -------------------------------------------------------------------------------- /Final/run.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os.path 3 | from datetime import datetime 4 | from pipeline import pipeline 5 | 6 | 7 | # Function to clean text and then put it on the txt file for further training 8 | def text_cleaner(text): 9 | letters_only = re.sub("[^a-zA-Z]", " ", text) 10 | words = letters_only.lower().split() 11 | line = " ".join(words) 12 | save_path = 'output\\' 13 | filename = 'test-%s.txt' % datetime.now().strftime('%Y-%m-%d-%H%M%S') 14 | complete_name = os.path.join(save_path, filename) 15 | with open(complete_name, mode='a', encoding='UTF-8', buffering=1) as a: 16 | a.write(line) 17 | print('File Generated') 18 | 19 | 20 | # Main Function 21 | if __name__ == "__main__": 22 | path = ('documents') 23 | path = path+"\\" 24 | print('Loading the files to the pipeline..') 25 | text = pipeline(path) 26 | print("Now Converting preparing the final text file") 27 | text_cleaner(text) 28 | print('All Activity Completed') 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Abhishek Kumar Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Final/output/test-2018-04-08-163829.txt: -------------------------------------------------------------------------------- 1 | this is a sample word document this is a sample word document this is a sample word document this is a sample word document this is a sample word document this is a sample word document adobe acrobat pdf files adobe portable document format pdf is a universal file format that preserves all of the fonts formatting colours and graphics of any source document regardless of the application and platform used to create it adobe pdf is an ideal format for electronic document distribution as it overcomes the problems commonly encountered with electronic file sharing anyone anywhere can open a pdf file all you need is the free adobe acrobat reader recipients of other file formats sometimes can t open files because they don t have the applications used to create the documents pdf files always print correctly on any printing device pdf files always display exactly as created regardless of fonts software and operating systems fonts and graphics are not lost due to platform software and version incompatibilities the free acrobat reader is easy to download and can be freely distributed by anyone compact pdf files are smaller than their source files and download a page at a time for fast display on the web this is a sample word document this is a sample word document this is a sample word document this is a sample word document this is a sample word document this is a sample word document -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Word Corpus Creator 2 | 3 | This tool can be used to create a word corpus from locally available documents.Word Corpus are required to build word embeddings for certain Natural Language Processing tasks. 4 | This tool will convert the documents present in the `documents` folder into a single clean txt file that can be then passed to a word vector generator such as [GloVe](https://github.com/stanfordnlp/GloVe) created by Stanford. 5 | 6 | ## Description of the files/folders: 7 | * `Final`: Contains all the necessary files for the tool kit 8 | * `documents`: Put all the documents that you want to convert into this folder. Currently it can accept: `pdf`,`docx` 9 | * `docx2text.py`: Converts the passed docx file to text 10 | * `pdf2text.py`: Converts the passed pdf to text 11 | * `pipeline.py`: Picks the file from the documents folder and passes it to the correct converter 12 | * `run.py`: This file is supposed to be executed to get the necessary text file output. This will clean the text generated from all the files and save the created text file in the correct location 13 | * `output`: This folder will have the text file as the output. 14 | 15 | ### NOTE: 16 | Final output will be a single text file that will be a combination of all the files in the document folder. 17 | Some sample documents are already saved in the `documents` folder for you to quickly test. 18 | 19 | ## Requirement: 20 | * Python: 2.7 and above 21 | * For pdf conversion 22 | - *pdfminer*(python 2.7) 23 | - *pdfminer.six*(python 3) 24 | * For docx conversion 25 | - python-docx 26 | --------------------------------------------------------------------------------