├── translate_pdfs
├── __init__.py
└── fonts
│ ├── NotoSerif-Bold.ttf
│ ├── NotoSerif-Italic.ttf
│ ├── NotoSerif-Regular.ttf
│ ├── NotoSerif-BoldItalic.ttf
│ ├── __init__.py
│ └── fonts.py
├── .gitignore
├── example.pdf
├── lv_example.pdf
├── AWS_lv_example.pdf
├── translator.py
├── aws_translator.py
└── README.md
/translate_pdfs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea/
3 | *.pyc
4 |
--------------------------------------------------------------------------------
/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/example.pdf
--------------------------------------------------------------------------------
/lv_example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/lv_example.pdf
--------------------------------------------------------------------------------
/AWS_lv_example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/AWS_lv_example.pdf
--------------------------------------------------------------------------------
/translate_pdfs/fonts/NotoSerif-Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Bold.ttf
--------------------------------------------------------------------------------
/translate_pdfs/fonts/NotoSerif-Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Italic.ttf
--------------------------------------------------------------------------------
/translate_pdfs/fonts/NotoSerif-Regular.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-Regular.ttf
--------------------------------------------------------------------------------
/translate_pdfs/fonts/NotoSerif-BoldItalic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/akapne01/translator/HEAD/translate_pdfs/fonts/NotoSerif-BoldItalic.ttf
--------------------------------------------------------------------------------
/translate_pdfs/fonts/__init__.py:
--------------------------------------------------------------------------------
1 | from translate_pdfs.fonts.fonts import regular, style, bold, italic, \
2 | boldItalic, heading1, heading2, heading3, heading4, heading5
3 |
--------------------------------------------------------------------------------
/translator.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from PyPDF2 import PdfFileReader
4 | from googletrans import Translator
5 | from reportlab.lib.pagesizes import letter
6 | from reportlab.platypus import Paragraph
7 | from reportlab.platypus import SimpleDocTemplate
8 |
9 | from translate_pdfs.fonts import *
10 |
11 | """
12 | This script uses Google Translate library to translate the PDF
13 | """
14 |
15 | """
16 | Constants
17 | """
# Google Translate endpoint handed to googletrans through ``service_urls``.
URL_COM = 'translate.googleapis.com'
# Default target language code for the script ("lv" is the code for Latvian).
LANG = "lv"
20 |
21 | """
22 | FUNCTIONS
23 | """
24 |
25 |
def get_translated_page_content(reader, lang):
    """
    Extracts, translates and cleans the text of every page of a PDF.

    :param reader: an open ``PyPDF2.PdfFileReader`` to read the pages from.
    :param lang: target language code, passed to googletrans as ``dest``.
    :return: list of strings, one entry per page in page order. Pages
        without extractable text yield an empty string.
    """
    translator = Translator(service_urls=[URL_COM])
    page_contents = []
    for p in range(reader.numPages):
        text = reader.getPage(p).extractText()
        if not text or not text.strip():
            # Nothing to translate (e.g. a blank or image-only page). Skip
            # the network round trip - googletrans may not cope with empty
            # input - but keep the one-entry-per-page contract.
            page_contents.append("")
            continue
        translation = translator.translate(text, dest=lang)
        # Collapse line breaks so the page flows as a single paragraph.
        # NOTE(review): the second replace removes every capital "W" from
        # the translated text; presumably it targets a text-extraction
        # artefact - confirm it is intended.
        result_text = translation.text.replace("\n", " ").replace("W", "")
        page_contents.append(result_text)
    return page_contents
42 |
43 |
def translate_pdf(path, lang):
    """
    Translates the PDF at ``path`` and writes the result as a new PDF.

    The output file is written next to the source file and is named
    ``<lang>_<original file name>``.

    :param path: path of the PDF file to translate.
    :param lang: target language code, e.g. "lv".
    """
    # Context manager so the source file is closed even when reading or
    # translating raises. The reader is consumed inside the block because
    # it reads from the open file object.
    with open(path, 'rb') as file:
        reader = PdfFileReader(file)
        page_contents = get_translated_page_content(reader, lang)

    # Prefix only the file name: prefixing the whole path would point into a
    # "<lang>_<dir>" directory when ``path`` contains a directory part.
    # The requested ``lang`` is used (not the module-level default) so the
    # file name always matches the language that was actually produced.
    directory, file_name = os.path.split(path)
    name = os.path.join(directory, f'{lang}_{file_name}')
    pdf = SimpleDocTemplate(name, pagesize=letter)

    # One paragraph per translated page.
    page_text = [Paragraph(text, encoding='utf-8', style=regular)
                 for text in page_contents]
    pdf.build(page_text)
58 |
59 |
if __name__ == '__main__':
    # Script entry point: translate the sample PDF into the default language.
    # Point ``file_name`` at another PDF to translate your own document.
    file_name = "example.pdf"
    translate_pdf(path=file_name, lang=LANG)
63 |
--------------------------------------------------------------------------------
/aws_translator.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from PyPDF2 import PdfFileReader
3 | from reportlab.lib.pagesizes import A4
4 | from reportlab.platypus import SimpleDocTemplate, Paragraph
5 |
6 | from translate_pdfs.fonts import regular
7 |
8 | """
9 | This script uses AWS Translator to translate the PDF.
10 | Please note to change the region to the one you wish to use.
11 | """
12 |
# Default target language code for the script ("lv" is the code for Latvian).
LANG = "lv"
# AWS region the Translate client is created in; change it to the region you use.
AWS_REGION = 'eu-west-1'
15 |
16 |
def get_translated_page_content(reader, lang):
    """
    Extracts, translates and cleans the text of every page of a PDF.

    :param reader: an open ``PyPDF2.PdfFileReader`` to read the pages from.
    :param lang: target language code, passed to Amazon Translate as
        ``TargetLanguageCode``; the source language is auto-detected.
    :return: list of strings, one entry per page in page order. Pages
        without extractable text yield an empty string.
    """
    translate = boto3.client(service_name='translate',
                             region_name=AWS_REGION,
                             use_ssl=True)
    page_contents = []

    for p in range(reader.numPages):
        text = reader.getPage(p).extractText()
        if not text or not text.strip():
            # Amazon Translate requires ``Text`` to be at least one character
            # long, so a blank or image-only page would abort the whole run.
            # Skip the request but keep the one-entry-per-page contract.
            page_contents.append("")
            continue

        # NOTE(review): TranslateText also caps the size of ``Text`` per
        # request; very long pages may need to be split - confirm against
        # the documents you translate.
        result = translate.translate_text(Text=text,
                                          SourceLanguageCode="auto",
                                          TargetLanguageCode=lang)
        translation = result.get('TranslatedText')

        # Collapse line breaks so the page flows as a single paragraph.
        # NOTE(review): the second replace removes every capital "W" from
        # the translated text; presumably it targets a text-extraction
        # artefact - confirm it is intended.
        result_text = translation.replace("\n", " ").replace("W", "")
        page_contents.append(result_text)
    return page_contents
40 |
41 |
def translate_pdf(path, lang):
    """
    Translates the PDF at ``path`` with AWS Translate and writes a new PDF.

    The output file is written next to the source file and is named
    ``AWS_<lang>_<original file name>``.

    :param path: path of the PDF file to translate.
    :param lang: target language code, e.g. "lv".
    """
    # Imported here because this module does not import ``os`` at the top.
    import os

    # Context manager so the source file is closed even when reading or
    # translating raises. The reader is consumed inside the block because
    # it reads from the open file object.
    with open(path, 'rb') as file:
        reader = PdfFileReader(file)
        page_contents = get_translated_page_content(reader, lang)

    # Prefix only the file name: prefixing the whole path would point into a
    # "AWS_<lang>_<dir>" directory when ``path`` contains a directory part.
    # The requested ``lang`` is used (not the module-level default) so the
    # file name always matches the language that was actually produced.
    directory, file_name = os.path.split(path)
    name = os.path.join(directory, f'AWS_{lang}_{file_name}')
    pdf = SimpleDocTemplate(name, pagesize=A4)

    # One paragraph per translated page.
    page_text = [Paragraph(text, encoding='utf-8', style=regular)
                 for text in page_contents]
    pdf.build(page_text)
56 |
57 |
if __name__ == '__main__':
    # Script entry point: translate the sample PDF into the default language.
    # Point ``file_name`` at another PDF to translate your own document.
    file_name = "example.pdf"
    translate_pdf(path=file_name, lang=LANG)
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ### PDF Translator
2 |
3 | This project reads the text content from PDF files, translates it and saves
4 | the translated text as a formatted PDF file.
5 |
6 | Two different translating options can be used:
7 |
8 | 1) The Google Translate Python library googletrans (can be installed via
9 | pip). To use this option, run: python translator.py
10 | 2) AWS Translate. This option requires you to have an AWS account. You need to
11 | obtain an AWS access key and a secret access key and configure them either
12 | as environment variables or in local files. To use AWS Translate, run: python aws_translator.py
13 |
14 | Additional Python packages that need to be installed:
15 | * PyPDF2
16 | * reportlab
17 | * boto3 (AWS Python SDK)
18 |
19 | #### AWS charges:
20 | AWS offers a free tier for trying out AWS Translate. The free tier includes 2
21 | million characters for 12 months. Please note that after that, charges
22 | will apply and AWS will charge $15 per million characters.
23 | More info: see the Amazon Translate pricing page, https://aws.amazon.com/translate/pricing/
24 |
25 | #### If you get googletrans error, here is how to fix it:
26 | Please note that there was a breaking change in googletrans. If you get the error: error in result (AttributeError: 'NoneType' object has no attribute 'group')
27 | then to fix it you need to do 2 things:
28 | 1) Change the URL to URL_COM = 'translate.googleapis.com'
29 | 2) Install this version of googletrans: pip install googletrans==3.1.0a0
30 | This fixes the issue. More information about the issue:
31 | https://stackoverflow.com/questions/52455774/googletrans-stopped-working-with-error-nonetype-object-has-no-attribute-group#52456197
32 |
33 | #### How can you translate your own file?
34 | 1) Change the language you would like to translate to:
35 | LANG = "lv" (the LANG constant near the top of translator.py and aws_translator.py)
36 | 2) Change the file name in file_name = "example.pdf". Replace example.pdf with the name of your PDF file.
37 | (in the if __name__ == '__main__': block at the bottom of translator.py and aws_translator.py)
--------------------------------------------------------------------------------
/translate_pdfs/fonts/fonts.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from reportlab.lib.styles import ParagraphStyle, getSampleStyleSheet
4 | from reportlab.pdfbase import pdfmetrics
5 | from reportlab.pdfbase.ttfonts import TTFont
6 |
# Resolve the fonts relative to this module instead of the current working
# directory, so the .ttf files are found no matter where the script is
# launched from. (The stray debug ``print`` of the path was removed so that
# importing the module no longer writes to stdout.)
directory = os.path.dirname(os.path.abspath(__file__))
9 | """
10 | CONSTANTS
11 | """
# Font sizes passed as ``fontSize`` to the body and title paragraph styles.
BODY_FONT_SIZE = 12
TITLE_FONT_SIZE = 14
# NOTE(review): despite its name this value is 0, which ReportLab's
# ``reportlab.lib.enums`` defines as TA_LEFT (TA_RIGHT is 2), so styles using
# it are left-aligned - confirm which alignment is intended.
ALIGNMENT_RIGHT = 0
# Passed as ``leading`` to the paragraph styles.
SPACING_BETWEEN_LINES = 20
# Passed as ``spaceAfter`` to the paragraph styles.
SPACE_AFTER = 14
# Passed as ``firstLineIndent`` to the paragraph styles.
FIRST_LINE_INDENT = 12
# Names the fonts are registered under; each also matches the stem of its
# .ttf file in the fonts directory.
REGULAR_FONT_NAME = 'NotoSerif-Regular'
BOLD_FONT_NAME = 'NotoSerif-Bold'
ITALIC_FONT_NAME = 'NotoSerif-Italic'
BOLDITALIC_FONT_NAME = 'NotoSerif-BoldItalic'

# Full paths of the .ttf files, built from the fonts directory and the names.
REGULAR_FONT_FILE = os.path.join(directory, REGULAR_FONT_NAME + ".ttf")
BOLD_FONT_FILE = os.path.join(directory, BOLD_FONT_NAME + ".ttf")
ITALIC_FONT_FILE = os.path.join(directory, ITALIC_FONT_NAME + ".ttf")
BOLDITALIC_FONT_FILE = os.path.join(directory, BOLDITALIC_FONT_NAME + ".ttf")
27 |
28 | """
29 | Registers Custom TTF fonts that are in the fonts directory.
30 | """
# Register every bundled TTF under its font name (same order as before).
for _font_name, _font_file in (
        (REGULAR_FONT_NAME, REGULAR_FONT_FILE),
        (BOLDITALIC_FONT_NAME, BOLDITALIC_FONT_FILE),
        (BOLD_FONT_NAME, BOLD_FONT_FILE),
        (ITALIC_FONT_NAME, ITALIC_FONT_FILE),
):
    pdfmetrics.registerFont(TTFont(_font_name, _font_file))
35 |
# ReportLab sample stylesheet; the styles built below use its entries
# ('Normal', 'Heading1', ...) as their parents.
style = getSampleStyleSheet()
37 |
38 |
def create_body_font(name):
    """
    Builds the paragraph style used for body text in the given font.

    :param name: registered font name; it is also used as the style name.
    :return: a ``ParagraphStyle`` whose parent is the sample 'Normal' style.
    """
    body_options = dict(
        parent=style['Normal'],
        fontName=name,
        fontSize=BODY_FONT_SIZE,
        leading=SPACING_BETWEEN_LINES,
        alignment=ALIGNMENT_RIGHT,
        firstLineIndent=FIRST_LINE_INDENT,
        spaceAfter=SPACE_AFTER,
    )
    return ParagraphStyle(name, **body_options)
51 |
52 |
def create_title_font(name, style_name):
    """
    Builds the paragraph style used for titles in the given font.

    :param name: registered font name; it is also used as the style name.
    :param style_name: key of the sample stylesheet entry to use as parent,
        e.g. 'Heading1'.
    :return: a ``ParagraphStyle`` derived from that parent style.
    """
    title_options = dict(
        parent=style[style_name],
        fontName=name,
        fontSize=TITLE_FONT_SIZE,
        leading=SPACING_BETWEEN_LINES,
        alignment=ALIGNMENT_RIGHT,
        firstLineIndent=FIRST_LINE_INDENT,
        spaceAfter=SPACE_AFTER,
    )
    return ParagraphStyle(name, **title_options)
65 |
66 |
67 | """
68 | Creates various types of fonts that can be imported and used to format PDF files
69 | """
# Body styles, one per font variant.
regular, bold, italic, boldItalic = (
    create_body_font(font_name)
    for font_name in (REGULAR_FONT_NAME, BOLD_FONT_NAME,
                      ITALIC_FONT_NAME, BOLDITALIC_FONT_NAME)
)
# Heading styles: the bold font on top of the sample 'Heading1'..'Heading5'.
heading1, heading2, heading3, heading4, heading5 = (
    create_title_font(BOLD_FONT_NAME, f'Heading{level}')
    for level in range(1, 6)
)
79 |
--------------------------------------------------------------------------------