├── .gitignore
├── .env.sample
├── requirements.txt
├── Dockerfile
├── index.html
├── README.md
└── main.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .env*
2 | !.env.sample
3 |
--------------------------------------------------------------------------------
/.env.sample:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | export HTML2PPTX_DEBUG_LOGS="False"
4 | export HTML2PPTX_DEBUG_SLIDES="False"
5 | export HTML2PPTX_PORT="8080"
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.7.1
2 | certifi==2019.3.9
3 | chardet==3.0.4
4 | idna==2.8
5 | lxml==4.3.3
6 | Pillow==6.2.0
7 | python-pptx==0.6.18
8 | requests==2.22.0
9 | soupsieve==1.9.1
10 | urllib3==1.25.3
11 | XlsxWriter==1.1.8
12 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3
2 |
3 | WORKDIR /app
4 |
5 | COPY requirements.txt /app
6 | RUN pip install --no-cache-dir -r requirements.txt
7 |
8 | COPY index.html /app
9 | COPY main.py /app
10 |
11 | RUN chown -R nobody:nogroup /app
12 |
13 | USER nobody
14 |
15 | CMD [ "python", "./main.py" ]
16 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
test
3 |
4 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # html2pptx
2 |
3 | # How to use
4 |
5 | This program requires Python 3 in order to work
6 | Install all requirements from `requirements.txt` by doing `pip install -r requirements.txt`
7 |
8 | Once everything is installed you can run html2pptx by executing the `main.py` file
9 |
10 | The server then serves the `index.html` page at its own address, on port 8080 by default (set the `HTML2PPTX_PORT` environment variable to use another port)
11 |
12 | The page expects two arguments, one URL and one CSS selector
13 | When you click on the submit button, html2pptx will get the URL contents and extract the HTML
14 | elements targeted by the CSS selector.
15 | The HTML element targeted must be a parent element with multiple children elements as siblings:
16 |
17 | ```html
18 | <div id="parent">
19 |   <div>...</div>
20 |   <div>...</div>
21 |   <div>...</div>
22 |   <div>...</div>
23 |   <div>...</div>
24 | </div>
25 | ```
26 |
27 | Every direct child element of the parent element becomes one slide, and the contents of that slide
28 | are taken from everything nested inside that child element.
29 | As for the contents and the layout, the service does its best to extract all the images and text of each child element
30 | and arranges them with a slide layout chosen from several criteria (number of images, text length, ...).
31 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import http.server
4 | import io
5 | import os
6 | import re
7 | import requests
8 | import urllib.parse
9 | from bs4 import BeautifulSoup
10 | from pptx import Presentation
11 | from pptx.dml.color import RGBColor
12 | from pptx.util import Inches, Pt
13 | from pptx.enum.text import MSO_VERTICAL_ANCHOR, MSO_AUTO_SIZE, PP_PARAGRAPH_ALIGNMENT
14 |
# Layout constants used when building slides
# (sizes are expressed in inches, the title font size in points)
SHORT_TEXT_LIMIT_CHARS = 75
TITLE_FONT_PT = 75
SLIDE_BLANK_LAYOUT = 6
SLIDE_WIDTH_INCHES = 10
SLIDE_HEIGHT_INCHES = 7.5
SLIDE_SMALL_MARGIN_INCHES = 0.25
COLUMN_MARGIN_INCHES = 0.1
HEIGHT_MARGIN_INCHES = 0.1

# Raw configuration, read from the environment as strings
# (the fallback values are used when a variable is not set)
html2pptx_debug_logs = os.environ.get('HTML2PPTX_DEBUG_LOGS', "false")
html2pptx_debug_slides = os.environ.get('HTML2PPTX_DEBUG_SLIDES', "false")
html2pptx_server_port = os.environ.get('HTML2PPTX_PORT', "8080")

# Effective configuration
# A flag is enabled only by the value "true" (compared case-insensitively),
# any other value leaves it disabled
debug_logs = html2pptx_debug_logs.lower() == "true"
debug_slides = html2pptx_debug_slides.lower() == "true"
server_port = int(html2pptx_server_port)

# Show the effective configuration at startup
print("Configuration:")
print("debug_logs: {}".format(debug_logs))
print("debug_slides: {}".format(debug_slides))
print("server_port: {}".format(server_port))
46 |
47 |
def html_to_pptx(url, css_selector, timeout=30):
    """Download a web page and convert the selected element into a PPTX file.

    :param url: address of the HTML page to download
    :param css_selector: CSS selector of the parent element whose direct
        children become the slides
    :param timeout: number of seconds to wait for the remote server before
        giving up (without it a stalled host would block the caller forever)
    :return: an ``io.BytesIO`` holding the saved presentation
    :raises requests.RequestException: when the page cannot be downloaded
        (network error, timeout or HTTP error status)
    """
    response = requests.get(url, timeout=timeout)
    # Do not turn an HTTP error page (404, 500, ...) into a presentation
    response.raise_for_status()
    slides = html_to_slides(response.text, css_selector)
    return slides_to_pptx(slides)
54 |
55 |
def html_to_slides(html_string, css_selector):
    """Extract the contents of every slide from an HTML document.

    The first element matched by *css_selector* is the parent element; each
    of its direct child tags becomes one slide. Bare text nodes located
    directly under the parent element are ignored.

    :param html_string: HTML source of the page
    :param css_selector: CSS selector of the parent element
    :return: a list with one entry per slide, each entry being the list of
        strings returned by ``html_to_slide``
    :raises ValueError: when the selector does not match any element
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    parent_element = soup.select_one(css_selector)
    if parent_element is None:
        # Indexing an empty result list used to fail here with a bare
        # IndexError, say what actually went wrong instead
        raise ValueError(
            "CSS selector {!r} does not match any element".format(css_selector))

    # Only real tags have a name, text nodes report None
    return [html_to_slide(child_tag)
            for child_tag in parent_element.children
            if child_tag.name is not None]
65 |
66 |
def html_to_slide(parent_tag):
    """Return the contents of one slide.

    :param parent_tag: tag holding everything that belongs to the slide
    :return: the list built by ``parse_tag_contents`` for this tag
    """
    slide_content = parse_tag_contents(parent_tag)
    return slide_content
69 |
70 |
def _sanitize_text(text):
    """Strip *text* and blank it out when it is an "[if mso | IE]" conditional comment."""
    stripped = text.strip()
    if stripped.startswith("[if mso | IE]"):
        return ""
    return stripped


def _sanitized_strings(tag, recursive):
    """Return the non-empty sanitized strings found in *tag* (direct ones only unless *recursive*)."""
    sanitized = (_sanitize_text(string)
                 for string in tag.find_all(string=True, recursive=recursive))
    return [string for string in sanitized if string != ""]


def parse_tag_contents(tag):
    """Flatten the children of *tag* into an ordered list of slide items.

    Every item is a string: either some text, or ``"img_src:"`` followed by
    the ``src`` attribute of an image.

    :param tag: the tag whose children are parsed
    :return: the list of items, in document order
    """
    tag_data = []
    for child in tag.children:
        # Just handle valid tags (text nodes have no name)
        if child.name is None:
            continue

        if child.name == "img":
            # Keep the "src" link, images without one cannot be downloaded
            # so they are skipped instead of failing with a KeyError
            src = child.get("src")
            if src:
                tag_data.append("img_src:" + src)
            continue

        if child.string is not None:
            # The tag holds a single string: keep it unless it is blank or a
            # conditional comment (same filtering as for nested text below)
            text = _sanitize_text(child.string)
            if text != "":
                tag_data.append(text)
            continue

        # If the tag has some direct text, we are in a case of formatted text
        # nested within other text tags: extract the whole text (direct and
        # from children) as one item and do not recurse into the children
        if _sanitized_strings(child, recursive=False):
            tag_data.append(" ".join(_sanitized_strings(child, recursive=True)))
            continue

        # Not nested text: do a recursive call to get the contents of the tag
        tag_data.extend(parse_tag_contents(child))
    return tag_data
118 |
119 |
def slides_to_pptx(slides):
    """Build a presentation from the slides contents and save it in memory.

    :param slides: list of slides, each one being the list of items handed
        over to ``fill_slide``
    :return: the ``io.BytesIO`` the presentation has been saved into
    """
    presentation = Presentation()
    for slide_content in slides:
        if debug_logs:
            print("============================================ NEW SLIDE ============================================")
        fill_slide(presentation, slide_content)
        if debug_logs:
            print("============================================ END SLIDE ============================================")

    # The presentation is saved into an in-memory stream and not on disk,
    # the stream itself is what gets returned
    output_stream = io.BytesIO()
    presentation.save(output_stream)
    return output_stream
136 |
137 |
def fill_slide(prs, slide, image_timeout=30):
    """Add one slide to the presentation from a list of parsed items.

    Nothing is added when the items hold no image and no text.

    :param prs: the presentation the slide is added to
    :param slide: list of strings, each one being either some text or
        ``"img_src:"`` followed by the link of an image
    :param image_timeout: number of seconds to wait for every image download
    :raises requests.RequestException: when an image cannot be downloaded
    """
    image_pattern = re.compile("^img_src:(.*)$")

    # Classify every item once: the match is None for text items
    classified_items = [(item, image_pattern.search(item)) for item in slide]

    # Determine number of images and max string length
    image_count = sum(1 for _, match in classified_items if match)
    max_chars_in_strings = max(
        (len(item) for item, match in classified_items if not match), default=0)

    # The slide is empty when there are no images and the max string length
    # is 0 (which means no images and no text)
    if image_count == 0 and max_chars_in_strings == 0:
        return

    # Add a slide when not empty
    prs_slide = prs.slides.add_slide(prs.slide_layouts[SLIDE_BLANK_LAYOUT])

    # We have a column layout if we have more than 1 image, at least 1 text
    # block and if the longest text is <= SHORT_TEXT_LIMIT_CHARS chars
    column_layout = image_count > 1 and 0 < max_chars_in_strings <= SHORT_TEXT_LIMIT_CHARS

    # Separate images from text
    images_array = []
    text_array = []
    column_text_array = []
    for item, match in classified_items:
        if not match:
            # Add text to the current column
            column_text_array.append(item)
            continue
        if column_layout:
            # Be sure to always associate an image with some text below
            if not images_array and column_text_array:
                # Text has been found before the first image: give it its own
                # column, topped by an "empty" placeholder image
                images_array.append("empty")
                text_array.append(column_text_array)
                column_text_array = []
            elif images_array:
                # Close the column of the previous image (empty or not)
                text_array.append(column_text_array)
                column_text_array = []
        images_array.append(match.group(1))

    # The final text is always added in column layout (empty or not),
    # otherwise only when some text has been found (one big column)
    if column_layout or column_text_array:
        text_array.append(column_text_array)

    # Determine available space in slide and column layout
    max_width_inches = SLIDE_WIDTH_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES
    max_height_inches = SLIDE_HEIGHT_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES
    column_width_inches = max_width_inches
    if images_array:
        available_slide_width_inches = max_width_inches \
            - (len(images_array)-1)*COLUMN_MARGIN_INCHES
        column_width_inches = available_slide_width_inches/len(images_array)

    # Init default values to compute image heights
    max_image_height_inches = 0
    images_heights_inches = []

    for index, image_link in enumerate(images_array):
        if debug_logs:
            print("IMAGE LINK:", image_link)

        if image_link == "empty":
            # Ignore the placeholder but keep the heights aligned with the columns
            images_heights_inches.append(0)
            continue

        # Determine image position
        top = Inches(SLIDE_SMALL_MARGIN_INCHES)
        left = Inches(SLIDE_SMALL_MARGIN_INCHES + index*column_width_inches
                      + index*COLUMN_MARGIN_INCHES)

        # Download image and add it to the slide
        image_req = requests.get(image_link, timeout=image_timeout)
        image_req.raise_for_status()
        image_box = prs_slide.shapes.add_picture(io.BytesIO(image_req.content), left, top)

        # Resize image (with aspect ratio preserved) when it is too large.
        # The width limit is the column width, which is the whole usable width
        # for a single image; fitting side-by-side images to the slide width
        # would let them overlap their neighbours
        ratio = image_box.width.inches / image_box.height.inches
        if image_box.width.inches > column_width_inches:
            image_box.width = Inches(column_width_inches)
            image_box.height = Inches(image_box.width.inches / ratio)
        if image_box.height.inches > max_height_inches:
            image_box.height = Inches(max_height_inches)
            image_box.width = Inches(image_box.height.inches * ratio)

        # Center image horizontally if only one image is found
        if len(images_array) == 1:
            left_centered_inches = SLIDE_WIDTH_INCHES / 2 - image_box.width.inches / 2
            image_box.left = Inches(max(left_centered_inches, SLIDE_SMALL_MARGIN_INCHES))

            # Center image vertically if this one image is alone with no text column
            if not text_array:
                top_centered_inches = SLIDE_HEIGHT_INCHES / 2 - image_box.height.inches / 2
                image_box.top = Inches(max(top_centered_inches, SLIDE_SMALL_MARGIN_INCHES))

        images_heights_inches.append(image_box.height.inches)
        max_image_height_inches = max(max_image_height_inches, image_box.height.inches)

    for index, text_column in enumerate(text_array):
        # For every text column, init default position and size
        left = Inches(SLIDE_SMALL_MARGIN_INCHES)
        width = Inches(max_width_inches)
        top = Inches(SLIDE_SMALL_MARGIN_INCHES)
        height = Inches(max_height_inches)

        # Height of the images the text has to be placed under
        height_above_inches = None
        if images_array:
            height_above_inches = max_image_height_inches
        if column_layout:
            # Column layout gets the final override if enabled
            left = Inches(SLIDE_SMALL_MARGIN_INCHES + index*column_width_inches
                          + index*COLUMN_MARGIN_INCHES)
            width = Inches(column_width_inches)
            height_above_inches = images_heights_inches[index]

        if height_above_inches is not None:
            top = Inches(SLIDE_SMALL_MARGIN_INCHES + height_above_inches + HEIGHT_MARGIN_INCHES)
            # A picture taking the whole usable height leaves a negative
            # remainder, clamp it so the text box never gets a negative height
            height = Inches(max(0, max_height_inches - height_above_inches
                                - HEIGHT_MARGIN_INCHES))

        # Create the text box
        text_box = prs_slide.shapes.add_textbox(left, top, width, height)

        if debug_slides:
            # Fill the shape with red in debug mode
            fill = text_box.fill
            fill.solid()
            fill.fore_color.rgb = RGBColor(255, 0, 0)

        # Create the text frame inside the text box and configure it
        text_frame = text_box.text_frame
        text_frame.clear()
        # Anchor at the top in column layout mode, in the middle otherwise
        text_frame.vertical_anchor = MSO_VERTICAL_ANCHOR.TOP if column_layout \
            else MSO_VERTICAL_ANCHOR.MIDDLE
        text_frame.word_wrap = True
        text_frame.auto_size = MSO_AUTO_SIZE.TEXT_TO_FIT_SHAPE

        # The text is a title when there are no images and only this text alone
        is_title = not images_array and len(text_array) == 1 and len(text_column) == 1

        # Add every string in the column to the text frame
        for text in text_column:
            if debug_logs:
                print(text)

            # Reuse the first paragraph while it is empty, then add new ones
            paragraph = text_frame.paragraphs[0]
            if paragraph.text != "":
                paragraph = text_frame.add_paragraph()
            if is_title:
                # title format
                paragraph.alignment = PP_PARAGRAPH_ALIGNMENT.CENTER
                paragraph.font.size = Pt(TITLE_FONT_PT)
            paragraph.text = text
334 |
335 |
class Html2pptx(http.server.BaseHTTPRequestHandler):
    """HTTP handler: GET serves the form page, POST converts a page to PPTX."""

    def do_GET(self):
        """Send the index.html page, whatever the requested path is."""
        # Read the page before sending any header so a missing or unreadable
        # file ends up as an error response and not as a broken "200"
        try:
            with open("index.html", 'rb') as page:
                body = page.read()
        except OSError:
            self.send_error(500, "Unable to read index.html")
            return

        # Set headers
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()

        # Send index.html page
        self.wfile.write(body)

    def do_POST(self):
        """Convert the page designated by the posted form and send the PPTX file.

        The urlencoded body must hold a "url" and a "selector" field. The
        answer is 400 when the request is malformed and 500 when the
        conversion fails.
        """
        import traceback

        # Retrieve and decode POST query data
        try:
            content_length = int(self.headers.get('Content-Length', 0))
            if content_length < 0:
                # A negative size would make read() wait for the end of the stream
                raise ValueError("negative Content-Length")
            post_data = self.rfile.read(content_length)
            decoded_post_data = urllib.parse.parse_qs(post_data.decode("utf-8"))
            url = decoded_post_data["url"][0]
            selector = decoded_post_data["selector"][0]
        except (KeyError, ValueError):
            # KeyError: missing (or blank) field
            # ValueError: bad Content-Length or body which is not valid UTF-8
            self.send_error(400, "Missing or invalid 'url' / 'selector' form fields")
            return

        if debug_logs:
            print("decoded_post_data[\"url\"][0]", url)
            print("decoded_post_data[\"selector\"][0]", selector)

        # Translate HTML to PPTX, retrieves presentation bytes stream
        # NOTE(security): the URL comes from the client and is fetched from
        # the server as-is, restrict the reachable hosts if this service is
        # exposed to untrusted users
        try:
            prs_bytes_stream = html_to_pptx(url, selector)
        except Exception:
            # Request boundary: keep the traceback on stderr and answer with
            # an error instead of dropping the connection
            traceback.print_exc()
            self.send_error(500, "Unable to convert the page to PPTX")
            return

        # Use getvalue() instead of read() with BytesIO to avoid problems
        # Source:
        # https://stackoverflow.com/questions/46981529/why-does-saving-a-presentation-to-a-file-like-object-produce-a-blank-presentatio?noredirect=1&lq=1
        payload = prs_bytes_stream.getvalue()

        # Set headers to download the PPTX file
        self.send_response(200)
        self.send_header("Content-Type", 'application/octet-stream')
        self.send_header("Content-Disposition", 'attachment; filename="presentation.pptx"')
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()

        # Send the PPTX presentation
        self.wfile.write(payload)
381 |
382 |
# Setup and start HTTP server with custom Html2pptx handler
# The server is only started when the file is run as a script, importing the
# module (from a test for instance) must not block in serve_forever()
if __name__ == "__main__":
    server_address = ("", server_port)
    httpd = http.server.HTTPServer(server_address, Html2pptx)
    print("Serving at port:", server_port)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop the server, exit quietly
        pass
    finally:
        # Always release the listening socket
        httpd.server_close()
388 |
--------------------------------------------------------------------------------