├── translate_pdfs ├── __init__.py └── fonts │ ├── NotoSerif-Bold.ttf │ ├── NotoSerif-Italic.ttf │ ├── NotoSerif-Regular.ttf │ ├── NotoSerif-BoldItalic.ttf │ ├── __init__.py │ └── fonts.py ├── .gitignore ├── example.pdf ├── lv_example.pdf ├── AWS_lv_example.pdf ├── translator.py ├── aws_translator.py └── README.md /translate_pdfs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | *.pyc 4 | -------------------------------------------------------------------------------- /example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/example.pdf -------------------------------------------------------------------------------- /lv_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/lv_example.pdf -------------------------------------------------------------------------------- /AWS_lv_example.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/AWS_lv_example.pdf -------------------------------------------------------------------------------- /translate_pdfs/fonts/NotoSerif-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Bold.ttf -------------------------------------------------------------------------------- /translate_pdfs/fonts/NotoSerif-Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Italic.ttf -------------------------------------------------------------------------------- /translate_pdfs/fonts/NotoSerif-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Regular.ttf -------------------------------------------------------------------------------- /translate_pdfs/fonts/NotoSerif-BoldItalic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-BoldItalic.ttf -------------------------------------------------------------------------------- /translate_pdfs/fonts/__init__.py: -------------------------------------------------------------------------------- 1 | from translate_pdfs.fonts.fonts import regular, style, bold, italic, \ 2 | boldItalic, heading1, heading2, heading3, heading4, heading5 3 | -------------------------------------------------------------------------------- /translator.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from PyPDF2 import PdfFileReader 4 | from googletrans import Translator 5 | from reportlab.lib.pagesizes import letter 6 | from reportlab.platypus import Paragraph 7 | from reportlab.platypus import SimpleDocTemplate 8 | 9 | from translate_pdfs.fonts import * 10 | 11 | """ 12 | This script uses Google Translate library to translate the PDF 13 | """ 14 | 15 | """ 16 | Constants 17 | """ 18 | URL_COM = 'translate.googleapis.com' 19 | LANG = "lv" 20 | 21 | """ 22 | FUNCTIONS 23 | """ 24 | 25 | 26 | def get_translated_page_content(reader, lang): 27 | """ 28 | Reads page content from the reader, translates it, 29 | cleans it and returns page content as a list of strings. 30 | Each entry in list represents a page 31 | """ 32 | num_pages = reader.numPages 33 | page_contents = [] 34 | translator = Translator(service_urls=[URL_COM]) 35 | for p in range(num_pages): 36 | page = reader.getPage(p) 37 | text = page.extractText() 38 | translation = translator.translate(text, dest=lang) 39 | result_text = translation.text.replace("\n", " ").replace("W", "") 40 | page_contents.append(result_text) 41 | return page_contents 42 | 43 | 44 | def translate_pdf(path, lang): 45 | file = open(path, 'rb') 46 | reader = PdfFileReader(file) 47 | page_contents = get_translated_page_content(reader, lang) 48 | 49 | page_text = [] 50 | name = f'{LANG}_{path}' 51 | pdf = SimpleDocTemplate(name, pagesize=letter) 52 | 53 | for text in page_contents: 54 | page_text.append( 55 | Paragraph(text, encoding='utf-8', style=regular)) 56 | 57 | pdf.build(page_text) 58 | 59 | 60 | if __name__ == '__main__': 61 | file_name = "example.pdf" 62 | translate_pdf(file_name, LANG) 63 | -------------------------------------------------------------------------------- /aws_translator.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from PyPDF2 import PdfFileReader 3 | from reportlab.lib.pagesizes import A4 4 | from reportlab.platypus import SimpleDocTemplate, Paragraph 5 | 6 | from translate_pdfs.fonts import regular 7 | 8 | """ 9 | This script uses AWS Translator to translate the PDF. 10 | Please note to change the region to the one you wish to use. 11 | """ 12 | 13 | LANG = "lv" 14 | AWS_REGION = 'eu-west-1' 15 | 16 | 17 | def get_translated_page_content(reader, lang): 18 | """ 19 | Reads page content from the reader, translates it, 20 | cleans it and returns page content as a list of strings. 21 | Each entry in list represents a page 22 | """ 23 | num_pages = reader.numPages 24 | page_contents = [] 25 | translate = boto3.client(service_name='translate', 26 | region_name=AWS_REGION, 27 | use_ssl=True) 28 | 29 | for p in range(num_pages): 30 | page = reader.getPage(p) 31 | 32 | result = translate.translate_text(Text=page.extractText(), 33 | SourceLanguageCode="auto", 34 | TargetLanguageCode=lang) 35 | translation = result.get('TranslatedText') 36 | 37 | result_text = translation.replace("\n", " ").replace("W", "") 38 | page_contents.append(result_text) 39 | return page_contents 40 | 41 | 42 | def translate_pdf(path, lang): 43 | file = open(path, 'rb') 44 | reader = PdfFileReader(file) 45 | page_contents = get_translated_page_content(reader, lang) 46 | 47 | page_text = [] 48 | name = f'AWS_{LANG}_{path}' 49 | pdf = SimpleDocTemplate(name, pagesize=A4) 50 | 51 | for text in page_contents: 52 | page_text.append( 53 | Paragraph(text, encoding='utf-8', style=regular)) 54 | 55 | pdf.build(page_text) 56 | 57 | 58 | if __name__ == '__main__': 59 | file_name = "example.pdf" 60 | translate_pdf(file_name, LANG) 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### PDF Translator 2 | 3 | This projects reads the text content from PDF files, translates it and saves 4 | translated text as a formatted PDF file. 5 | 6 | Two different translating options can be used: 7 | 8 | 1) Google translate python library googletrans (can be installed via pip 9 | ). To use this option, use script python translator.py 10 | 2) AWS Translate. This option requires you to have AWS account. You need to 11 | obtain the AWS access key and a secret access key and configure them either 12 | as environment variables or local files. To use AWS Translate: python aws_translator.py 13 | 14 | Additional python packages that are required to install are: 15 | * PyPDF2 16 | * reportlab 17 | * boto3 (AWS Python SDK) 18 | 19 | #### AWS charges: 20 | AWS offer free tier to try out the AWS Translator. Free tier included 2 21 | million characters for 12 months. Please note that after that the charges 22 | will occur and AWS will charge $15 per million characters. 23 | More info: Click Here for more info 24 | 25 | #### If you get googletrans error, here is how to fix it: 26 | Please note that there was a breaking change for googletrans. If you get error: error in result (AttributeError: 'NoneType' object has no attribute 'group') 27 | Then to fix you need to do 2 things: 28 | 1) Change URL to URL_COM = 'translate.googleapis.com' 29 | 2) Install the latest version of Google Translate: pip install googletrans==3.1.0a0 30 | It fixed the issue. More information about the issue: 31 | https://stackoverflow.com/questions/52455774/googletrans-stopped-working-with-error-nonetype-object-has-no-attribute-group#52456197 32 | 33 | #### How you can translate your own file? 34 | 1) Change language to which you would like to translate to: 35 | LANG = "lv" (ln: 19 in translator.py & ln: 13 in aws_translator.py) 36 | 2) Change the file name from file_name = "example.pdf". Raplace example.pdf to match the pdf file name you have. 37 | (ln: 61 in translator.py & ln: 59 in aws_translator.py) -------------------------------------------------------------------------------- /translate_pdfs/fonts/fonts.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet 4 | from reportlab.pdfbase import pdfmetrics 5 | from reportlab.pdfbase.ttfonts import TTFont 6 | 7 | directory = os.path.join(os.getcwd(), "translate_pdfs", "fonts") 8 | print(directory) 9 | """ 10 | CONSTANTS 11 | """ 12 | BODY_FONT_SIZE = 12 13 | TITLE_FONT_SIZE = 14 14 | ALIGNMENT_RIGHT = 0 15 | SPACING_BETWEEN_LINES = 20 16 | SPACE_AFTER = 14 17 | FIRST_LINE_INDENT = 12 18 | REGULAR_FONT_NAME = 'NotoSerif-Regular' 19 | BOLD_FONT_NAME = 'NotoSerif-Bold' 20 | ITALIC_FONT_NAME = 'NotoSerif-Italic' 21 | BOLDITALIC_FONT_NAME = 'NotoSerif-BoldItalic' 22 | 23 | REGULAR_FONT_FILE = os.path.join(directory, REGULAR_FONT_NAME + ".ttf") 24 | BOLD_FONT_FILE = os.path.join(directory, BOLD_FONT_NAME + ".ttf") 25 | ITALIC_FONT_FILE = os.path.join(directory, ITALIC_FONT_NAME + ".ttf") 26 | BOLDITALIC_FONT_FILE = os.path.join(directory, BOLDITALIC_FONT_NAME + ".ttf") 27 | 28 | """ 29 | Registers Custom TTF fonts that are in the fonts directory. 30 | """ 31 | pdfmetrics.registerFont(TTFont('NotoSerif-Regular', REGULAR_FONT_FILE)) 32 | pdfmetrics.registerFont(TTFont('NotoSerif-BoldItalic', BOLDITALIC_FONT_FILE)) 33 | pdfmetrics.registerFont(TTFont('NotoSerif-Bold', BOLD_FONT_FILE)) 34 | pdfmetrics.registerFont(TTFont('NotoSerif-Italic', ITALIC_FONT_FILE)) 35 | 36 | style = getSampleStyleSheet() 37 | 38 | 39 | def create_body_font(name): 40 | """ 41 | Defines a paragraph style for the main text body. 42 | """ 43 | return ParagraphStyle(name, 44 | fontName=name, 45 | fontSize=BODY_FONT_SIZE, 46 | parent=style['Normal'], 47 | alignment=ALIGNMENT_RIGHT, 48 | leading=SPACING_BETWEEN_LINES, 49 | firstLineIndent=FIRST_LINE_INDENT, 50 | spaceAfter=SPACE_AFTER) 51 | 52 | 53 | def create_title_font(name, style_name): 54 | """ 55 | Defines a paragraph style for titles 56 | """ 57 | return ParagraphStyle(name, 58 | fontName=name, 59 | fontSize=TITLE_FONT_SIZE, 60 | parent=style[style_name], 61 | alignment=ALIGNMENT_RIGHT, 62 | leading=SPACING_BETWEEN_LINES, 63 | firstLineIndent=FIRST_LINE_INDENT, 64 | spaceAfter=SPACE_AFTER) 65 | 66 | 67 | """ 68 | Creates various types of fonts that can be imported and used to format PDF files 69 | """ 70 | regular = create_body_font(REGULAR_FONT_NAME) 71 | bold = create_body_font(BOLD_FONT_NAME) 72 | italic = create_body_font(ITALIC_FONT_NAME) 73 | boldItalic = create_body_font(BOLDITALIC_FONT_NAME) 74 | heading1 = create_title_font(BOLD_FONT_NAME, 'Heading1') 75 | heading2 = create_title_font(BOLD_FONT_NAME, 'Heading2') 76 | heading3 = create_title_font(BOLD_FONT_NAME, 'Heading3') 77 | heading4 = create_title_font(BOLD_FONT_NAME, 'Heading4') 78 | heading5 = create_title_font(BOLD_FONT_NAME, 'Heading5') 79 | --------------------------------------------------------------------------------