├── requirements.txt
├── README.md
└── scripts
    ├── helper_scripts
    │   ├── movestuff.sh
    │   ├── README.md
    │   └── gdrive_upload_bulk.py
    ├── experimental
    │   ├── README.md
    │   └── Trigraph_Finder.py
    ├── NLP
    │   ├── wordripper.py
    │   ├── sentence_tokenize.py
    │   ├── stopword_filter.py
    │   ├── speechparts.py
    │   ├── bulk-wordripper.py
    │   ├── bulk-speechparts.py
    │   └── README.md
    └── OCR
        ├── local OCR processing
        │   ├── striptext.sh
        │   └── OCR.sh
        ├── google cloud processing
        │   ├── client_secret.json
        │   └── OCR_drive.py
        └── README.md

/requirements.txt:
--------------------------------------------------------------------------------
ocrmypdf
tesseract-ocr
unpaper

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# KrangTools
KrangTools is a set of scripts that can be used in text processing for OCR and NLP pipeline building.

--------------------------------------------------------------------------------
/scripts/helper_scripts/movestuff.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Move up to 15000 regular files from the current directory into a target
# directory. Uses a glob instead of parsing `ls` output, and quotes every
# expansion so filenames containing spaces or glob characters survive.
count=0
for file in ./*
do
    [ -f "$file" ] || continue          # skip directories (the old `grep -v /`)
    count=$((count + 1))
    [ "$count" -gt 15000 ] && break     # same 15000-file cap as before
    # Change this value
    mv -- "$file" /dir/foo
done

--------------------------------------------------------------------------------
/scripts/experimental/README.md:
--------------------------------------------------------------------------------
# Experimental! All scripts in this file are considered experimental!
They won't hurt anything, but they also may not function cleanly

--------------------------------------------------------------------------------
/scripts/NLP/wordripper.py:
--------------------------------------------------------------------------------
import io
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the whole file and print its word tokens.
with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
    data = myfile.read()

print(word_tokenize(data))

--------------------------------------------------------------------------------
/scripts/OCR/local OCR processing/striptext.sh:
--------------------------------------------------------------------------------
#!/bin/sh

# This one-liner will rip raw text files from PDFs producing a text file with
# the same name in the same directory (foo.pdf -> foo.txt; the old version
# produced foo.pdf.txt, contradicting the README).
for file in *.pdf; do pdftotext "$file" "${file%.pdf}.txt"; done

--------------------------------------------------------------------------------
/scripts/NLP/sentence_tokenize.py:
--------------------------------------------------------------------------------
import io
from nltk.tokenize import sent_tokenize, word_tokenize

# Join lines with a space: replacing '\n' with '' (as before) glued the last
# word of each line to the first word of the next, corrupting tokenization.
with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
    data = myfile.read().replace('\n', ' ')

print(sent_tokenize(data))

--------------------------------------------------------------------------------
/scripts/NLP/stopword_filter.py:
--------------------------------------------------------------------------------
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Print each line of the file with English stopwords removed.
# The previous version only printed the stopword set itself, once per input
# line, and never actually filtered anything (nor closed the file).
stop_words = set(stopwords.words("english"))  # build the set once, not per line

filename = "filename.txt"
with open(filename, "r") as file:
    for line in file:
        filtered = [word for word in word_tokenize(line)
                    if word.lower() not in stop_words]
        print(filtered)

--------------------------------------------------------------------------------
/scripts/NLP/speechparts.py:
--------------------------------------------------------------------------------
import nltk
import io
from nltk.tokenize import sent_tokenize, word_tokenize

# Join lines with a space (not '') so words at line breaks are not merged.
with io.open('filename.txt', 'r', encoding="UTF8") as myfile:
    data = myfile.read().replace('\n', ' ')

# Tag each token with its part of speech.
text = word_tokenize(data)
finished = nltk.pos_tag(text)

print(finished)

--------------------------------------------------------------------------------
/scripts/OCR/local OCR processing/OCR.sh:
--------------------------------------------------------------------------------
#!/bin/sh

for f in *.pdf;
do
    # Fixed quoting: the old `"Running OCR on "$f""` left $f unquoted.
    echo "Running OCR on $f"
    ocrmypdf -v --deskew --clean --clean-final "$f" "$f";

    echo "moving ocr'd file"
    # Make sure this value is changed to fit your destination!
    mv "$f" /root/OCRd

    echo "finished"
done

--------------------------------------------------------------------------------
/scripts/NLP/bulk-wordripper.py:
--------------------------------------------------------------------------------
import io
import os
from nltk.tokenize import sent_tokenize, word_tokenize

# Tokenize every .txt file in the current directory and print its tokens.
for filename in os.listdir("."):
    if filename.endswith(".txt"):
        with io.open(filename, 'r', encoding="UTF8") as f:
            data = f.read()
        print(word_tokenize(data))

--------------------------------------------------------------------------------
/scripts/OCR/google cloud processing/client_secret.json:
--------------------------------------------------------------------------------
{"installed":{"client_id":"

[NOTE: client_secret.json is truncated in this dump — the OAuth client ID and
secret are redacted. The text below is the surviving tail of
/scripts/OCR/README.md.]
```

## OCR.sh
Recommend you use this with tmux, especially if you're processing through a high volume of files, most especially if your files have a high number of pages in them.

## striptext.sh
This script will help rip raw text out of your PDFs. It will dump raw text files out with the same name as the original file.

## OCR_Drive.py
This script supports the Google docs upload method.
It's much less resource intensive and much faster per document, but there are things you should take note of before using it:

THIS SCRIPT WILL REPLACE THE ORIGINAL PDF FILE! Make sure you have copies if you want to keep the originals intact!
Be aware that this will only work for documents under 5mb in size and you will need an active network connection to upload and download the pdf files you wish to process.

You will also need to have a google drive api key in order to make this work.

Credit for the single document processing script goes out to https://tanaikech.github.io/2017/05/02/ocr-using-google-drive-api/
Great thanks to @glitchliz for helping with the failover protections!

--------------------------------------------------------------------------------
/scripts/OCR/google cloud processing/OCR_drive.py:
--------------------------------------------------------------------------------
from __future__ import print_function
import httplib2
import os
import io
import shutil
import time

from apiclient import discovery
from oauth2client import client
from oauth2client import tools
from oauth2client.file import Storage
from apiclient.http import MediaFileUpload, MediaIoBaseDownload

start = time.time()

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser]).parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Drive API Python Quickstart'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    credential_path = os.path.join("./", 'drive-python-quickstart.json')
    store = Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, flags)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def main():
    """OCR every PDF in the current directory via Google Drive.

    Each PDF is uploaded as a Google Doc (Drive runs OCR on import), the
    document is exported back as plain text OVER the local PDF file, the
    temporary Doc is deleted, and the file is moved to a done/reject dir.
    """
    # Authorize once and reuse the service for every file; the previous
    # version re-ran the whole credential flow and rebuilt the Drive
    # service inside the loop, once per PDF.
    credentials = get_credentials()
    http = credentials.authorize(httplib2.Http())
    service = discovery.build('drive', 'v3', http=http)

    # change these values for the path of the locally stored files, and
    # where you want them moved after processing
    path = '/media/Archives/CREST/copycopy/'
    finDir = '/media/Archives/CREST/donedone/'
    rejDir = '/media/Archives/CREST/rejected'

    for filename in os.listdir("."):
        if not filename.endswith(".pdf"):
            continue

        imgfile = filename  # Image with texts (png, jpg, bmp, gif, pdf)
        txtfile = filename  # Text file outputted by OCR (overwrites the PDF!)

        # This will help you if the script ever dies while processing a certain file
        print(filename)
        try:
            # Importing with a Google-Docs mime type makes Drive OCR the PDF.
            mime = 'application/vnd.google-apps.document'
            res = service.files().create(
                body={
                    'name': imgfile,
                    'mimeType': mime
                },
                media_body=MediaFileUpload(imgfile, mimetype=mime, resumable=True)
            ).execute()

            # Export the OCR'd document as plain text over the local file.
            downloader = MediaIoBaseDownload(
                io.FileIO(txtfile, 'wb'),
                service.files().export_media(fileId=res['id'], mimeType="text/plain")
            )
            done = False
            while done is False:
                status, done = downloader.next_chunk()
            shutil.move(os.path.join(path, filename), finDir)

            # Clean up the temporary Google Doc on the Drive side.
            service.files().delete(fileId=res['id']).execute()
        except Exception:
            # Failover: park anything that failed in the reject directory.
            # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
            shutil.move(os.path.join(path, filename), rejDir)
        print("Done.")

        # Per-file progress report of total elapsed wall-clock time.
        end = time.time()
        print("I have been running for", end - start, "seconds")

if __name__ == '__main__':
    main()

--------------------------------------------------------------------------------
/scripts/helper_scripts/gdrive_upload_bulk.py:
--------------------------------------------------------------------------------
import os
import sys
import logging
import httplib2
from mimetypes import guess_type

# Following libraries can be installed by executing:
# sudo pip install --upgrade google-api-python-client
from apiclient.discovery import build
from apiclient.http import MediaFileUpload
from apiclient.errors import ResumableUploadError
from oauth2client.client import OAuth2WebServerFlow
from oauth2client.file import Storage

# Log only oauth2client errors
logging.basicConfig(level="ERROR")

# Path to token json file, it should be in same directory as script
token_file = sys.path[0] + '/auth_token.txt'

# Copy your credentials from the APIs Console
CLIENT_ID = ''
CLIENT_SECRET = ''
# Check https://developers.google.com/drive/scopes for all available scopes
OAUTH_SCOPE = 'https://www.googleapis.com/auth/drive.file'
# Redirect URI for installed apps, can be left as is
REDIRECT_URI = 'urn:ietf:wg:oauth:2.0:oob'


# NOTE: these functions used to be (re)defined inside the `for filename`
# loop, together with the `if __name__ == '__main__'` guard. They are now
# ordinary module-level functions and the loop lives in the main guard.

def file_ops(filename):
    # Get mime type of given file, falling back to text/plain.
    # The previous version had its return statement commented out, so every
    # caller received None as the mime type.
    mime_type = guess_type(filename)[0]
    return mime_type if mime_type else 'text/plain'


def create_token_file(token_file):
    # Run through the OAuth flow, store the credentials, return the storage.
    flow = OAuth2WebServerFlow(
        CLIENT_ID,
        CLIENT_SECRET,
        OAUTH_SCOPE,
        redirect_uri=REDIRECT_URI
    )
    authorize_url = flow.step1_get_authorize_url()
    print('Go to the following link in your browser: ' + authorize_url)
    code = raw_input('Enter verification code: ').strip()  # Python 2 script
    credentials = flow.step2_exchange(code)
    storage = Storage(token_file)
    storage.put(credentials)
    return storage


def authorize(token_file, storage):
    # Get credentials (from the given storage, or from the token file).
    if storage is None:
        storage = Storage(token_file)
    credentials = storage.get()
    # Create an httplib2.Http object and authorize it with our credentials
    http = httplib2.Http()
    credentials.refresh(http)
    http = credentials.authorize(http)
    return http


def upload_file(filename, mime_type):
    # Upload one file to Drive, make it link-shareable, return download URL.
    # Uses the module-level `http` set up in the main guard below.
    drive_service = build('drive', 'v2', http=http)
    # File body description
    media_body = MediaFileUpload(filename,
                                 mimetype=mime_type,
                                 resumable=True)
    body = {
        'title': filename,
        'description': 'backup',
        'mimeType': mime_type,
    }
    # Permissions body description: anyone who has link can upload
    # Other permissions can be found at https://developers.google.com/drive/v2/reference/permissions
    permissions = {
        'role': 'reader',
        'type': 'anyone',
        'value': None,
        'withLink': True
    }
    # Insert a file
    file = drive_service.files().insert(body=body, media_body=media_body).execute()
    # Insert new permissions
    drive_service.permissions().insert(fileId=file['id'], body=permissions).execute()
    # Define file instance and get url for download
    file = drive_service.files().get(fileId=file['id']).execute()
    download_url = file.get('webContentLink')
    return download_url


if __name__ == '__main__':
    for filename in os.listdir("."):
        if not filename.endswith(".pdf"):
            continue
        # Bail out early if the file cannot actually be opened.
        try:
            with open(filename) as f:
                pass
        except IOError as e:
            print(e)
            sys.exit(1)
        # Check if token file exists, if not create it by requesting authorization code
        try:
            with open(token_file) as f:
                pass
        except IOError:
            http = authorize(token_file, create_token_file(token_file))
        # Authorize, get file parameters, upload file and print out result URL for download
        http = authorize(token_file, None)
        mime_type = file_ops(filename)
        # Sometimes API fails to retrieve starting URI, we wrap it.
        try:
            print(upload_file(filename, mime_type))
        except ResumableUploadError as e:
            print("Error occurred while first upload try:", e)
            print("Trying one more time.")
            print(upload_file(filename, mime_type))

--------------------------------------------------------------------------------