test

├── .gitignore ├── .env.sample ├── requirements.txt ├── Dockerfile ├── index.html ├── README.md └── main.py /.gitignore: -------------------------------------------------------------------------------- 1 | .env* 2 | !.env.sample 3 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | export HTML2PPTX_DEBUG_LOGS="False" 4 | export HTML2PPTX_DEBUG_SLIDES="False" 5 | export HTML2PPTX_PORT="8080" 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.7.1 2 | certifi==2019.3.9 3 | chardet==3.0.4 4 | idna==2.8 5 | lxml==4.3.3 6 | Pillow==6.2.0 7 | python-pptx==0.6.18 8 | requests==2.22.0 9 | soupsieve==1.9.1 10 | urllib3==1.25.3 11 | XlsxWriter==1.1.8 12 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt /app 6 | RUN pip install --no-cache-dir -r requirements.txt 7 | 8 | COPY index.html /app 9 | COPY main.py /app 10 | 11 | RUN chown -R nobody:nogroup /app 12 | 13 | USER nobody 14 | 15 | CMD [ "python", "./main.py" ] 16 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | test 3 | 4 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # html2pptx 2 | 3 | # How to use 4 | 5 | This program requires Python 3 in order to work 6 | Install all requirements from `requirements.txt` by doing `pip install -r 
requirements.txt` 7 | 8 | Once everything is installed, you can run html2pptx by executing the `main.py` file. 9 | 10 | You should be able to access the `index.html` page at the address where the server is located. 11 | 12 | The page expects two arguments: a URL and a CSS selector. 13 | When you click on the submit button, html2pptx will fetch the contents of the URL and extract the HTML 14 | elements targeted by the CSS selector. 15 | The targeted HTML element must be a parent element with multiple child elements as siblings: 16 | 17 | ```html 18 | <div class="parent">
19 |   <div>...</div>
20 |   <div>...</div>
21 |   <div>...</div>
22 |   <div>...</div>
23 |   <div>...</div>
24 | </div>
25 | ``` 26 | 27 | Every direct child element of the parent element will become a slide and its contents will be 28 | all the children of the direct child element 29 | As for the contents and the layout, the service will try its best to get all images and text from the page 30 | and rearrange them with a slide layout depending on various elements (number of images, text length, ...) 31 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import http.server 4 | import io 5 | import os 6 | import re 7 | import requests 8 | import urllib.parse 9 | from bs4 import BeautifulSoup 10 | from pptx import Presentation 11 | from pptx.dml.color import RGBColor 12 | from pptx.util import Inches, Pt 13 | from pptx.enum.text import MSO_VERTICAL_ANCHOR, MSO_AUTO_SIZE, PP_PARAGRAPH_ALIGNMENT 14 | 15 | SHORT_TEXT_LIMIT_CHARS = 75 16 | TITLE_FONT_PT = 75 17 | SLIDE_BLANK_LAYOUT = 6 18 | SLIDE_WIDTH_INCHES = 10 19 | SLIDE_HEIGHT_INCHES = 7.5 20 | SLIDE_SMALL_MARGIN_INCHES = 0.25 21 | COLUMN_MARGIN_INCHES = 0.1 22 | HEIGHT_MARGIN_INCHES = 0.1 23 | 24 | # Init configuration default values 25 | debug_logs = False 26 | debug_slides = False 27 | server_port = 8080 28 | 29 | # Parse configuration environment variables 30 | html2pptx_debug_logs = os.getenv('HTML2PPTX_DEBUG_LOGS', "false") 31 | html2pptx_debug_slides = os.getenv('HTML2PPTX_DEBUG_SLIDES', "false") 32 | html2pptx_server_port = os.getenv('HTML2PPTX_PORT', "8080") 33 | 34 | # Set configuration variables depending on environment variables 35 | if html2pptx_debug_logs.lower() == "true": 36 | debug_logs = True 37 | if html2pptx_debug_slides.lower() == "true": 38 | debug_slides = True 39 | server_port = int(html2pptx_server_port) 40 | 41 | # Print configuration 42 | print("Configuration:") 43 | print("debug_logs:", debug_logs) 44 | print("debug_slides:", debug_slides) 45 | 
print("server_port:", server_port)

# Seconds to wait for a remote server before giving up on a download.
# Without a timeout, requests may wait indefinitely and block the
# single-threaded HTTP server.
REQUEST_TIMEOUT_SECONDS = 10

# Prefix used in slide data to mark an entry as an image URL instead of text.
IMAGE_PREFIX = "img_src:"
IMAGE_PATTERN = re.compile(r"^img_src:(.*)$")

# Text of Outlook/IE conditional comments, which must not end up in slides.
CONDITIONAL_COMMENT_PATTERN = re.compile(r"^\[if mso \| IE\].*")


def html_to_pptx(url, css_selector):
    """Download the page at *url* and convert it to a PPTX byte stream.

    :param url: address of the HTML page to convert
    :param css_selector: CSS selector of the parent element whose direct
        children each become one slide
    :return: ``io.BytesIO`` holding the saved presentation
    :raises requests.RequestException: when the page cannot be downloaded or
        the server answers with an HTTP error status
    :raises ValueError: when the selector matches nothing in the page
    """
    # NOTE(review): the URL comes straight from the web form, so this service
    # will fetch any address reachable from the host (SSRF risk). Restrict the
    # allowed targets if the server is exposed to untrusted users.
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    slides = html_to_slides(response.text, css_selector)
    return slides_to_pptx(slides)


def html_to_slides(html_string, css_selector):
    """Split an HTML document into slides.

    Every direct child tag of the first element matched by *css_selector*
    becomes one slide.

    :param html_string: HTML source of the page
    :param css_selector: CSS selector of the parent element
    :return: list of slides, each one being the list returned by
        :func:`html_to_slide`
    :raises ValueError: when the selector matches no element
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    useful_content = soup.select(css_selector)
    if not useful_content:
        raise ValueError(
            "CSS selector {!r} does not match any element".format(css_selector))
    return [
        html_to_slide(parent_content_tag)
        for parent_content_tag in useful_content[0].children
        # Text nodes have no tag name; only real tags become slides
        if parent_content_tag.name is not None
    ]


def html_to_slide(parent_tag):
    """Return the slide data (text strings and image markers) of one tag."""
    return parse_tag_contents(parent_tag)


def _sanitized_strings(tag, recursive):
    """Return the stripped, non-empty text nodes found in *tag*.

    With ``recursive=False`` only the text directly inside *tag* is returned;
    with ``recursive=True`` the text of all descendants is included as well.
    Conditional-comment text (``[if mso | IE]...``) is dropped.
    """
    strings = []
    for node in tag.find_all(string=True, recursive=recursive):
        sanitized_string = node.strip()
        if sanitized_string == "":
            continue
        if CONDITIONAL_COMMENT_PATTERN.match(sanitized_string):
            continue
        strings.append(sanitized_string)
    return strings


def parse_tag_contents(tag):
    """Flatten the children of *tag* into an ordered list of slide data.

    :param tag: BeautifulSoup tag to walk through
    :return: list of strings; an image is stored as ``"img_src:" + url``,
        anything else is plain text
    """
    tag_data = []
    for children_content_tag in tag.children:
        if children_content_tag.name is None:
            # Just handle valid tags (bare text nodes have no name)
            continue

        if children_content_tag.name == "img":
            # If we have an image, get the "src" link; an <img> without
            # "src" has nothing to download, so it is skipped instead of
            # raising KeyError
            src = children_content_tag.get("src")
            if src:
                tag_data.append(IMAGE_PREFIX + src)
            continue

        if children_content_tag.string is not None:
            # If we have only one string, keep it (unless it is blank)
            stripped = children_content_tag.string.strip()
            if stripped != "":
                tag_data.append(stripped)
            continue

        # If the tag holds some direct text, we are in a case of formatted
        # text nested within other text tags: extract the whole text (direct
        # and from children) and stop the recursion for this tag
        if _sanitized_strings(children_content_tag, recursive=False):
            tag_data.append(
                " ".join(_sanitized_strings(children_content_tag, recursive=True)))
            continue

        # Not nested text: recurse to get the contents of the children tag
        tag_data.extend(parse_tag_contents(children_content_tag))
    return tag_data


def slides_to_pptx(slides):
    """Build a presentation from *slides* and return it as a byte stream.

    :param slides: list of slide data lists (see :func:`parse_tag_contents`)
    :return: ``io.BytesIO`` containing the saved PPTX file; read it with
        ``getvalue()`` because the stream position is left where the save
        finished writing
    """
    prs = Presentation()
    for slide in slides:
        if debug_logs:
            print("============================================ NEW SLIDE ============================================")
        fill_slide(prs, slide)
        if debug_logs:
            print("============================================ END SLIDE ============================================")
    prs_bytes_stream = io.BytesIO()
    prs.save(prs_bytes_stream)
    return prs_bytes_stream


def _split_slide_content(slide, column_layout):
    """Separate the image links of a slide from its text columns.

    :param slide: list of slide data strings
    :param column_layout: whether every image gets its own text column
    :return: ``(images_array, text_array)``; in column layout both lists have
        the same length and ``"empty"`` stands for a missing leading image
    """
    images_array = []
    text_array = []
    column_text_array = []

    for slide_data in slide:
        image_found = IMAGE_PATTERN.search(slide_data)
        if not image_found:
            # Add text to the current column
            column_text_array.append(slide_data)
            continue

        if column_layout:
            # Be sure to always associate an image with some text below
            if not images_array and column_text_array:
                # Text found before the first image: pair it with an
                # "empty" placeholder image
                images_array.append("empty")
                text_array.append(column_text_array)
                column_text_array = []
            elif images_array:
                # Close the text column of the previous image (empty or not)
                text_array.append(column_text_array)
                column_text_array = []
        images_array.append(image_found.group(1))

    # Column layout always gets the final column (empty or not); otherwise
    # there is one big column, added only if some text has been found
    if column_layout or column_text_array:
        text_array.append(column_text_array)
    return images_array, text_array


def _remaining_height_inches(image_height_inches):
    """Return the height left below an image, never less than zero.

    An image resized to the full usable slide height would otherwise give a
    negative text box height, which is not a valid shape size.
    """
    remaining = (SLIDE_HEIGHT_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES
                 - image_height_inches - HEIGHT_MARGIN_INCHES)
    return max(remaining, 0)


def _add_images(prs_slide, images_array, text_array, column_width_inches):
    """Download every image and place it on the slide.

    :return: list with the final height (in inches) of every entry of
        *images_array*; placeholders and images that could not be added
        count as ``0`` so that indexes stay aligned with the text columns
    """
    max_width_inches = SLIDE_WIDTH_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES
    max_height_inches = SLIDE_HEIGHT_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES
    images_heights_inches = []

    for index, image_link in enumerate(images_array):
        if debug_logs:
            print("IMAGE LINK:", image_link)

        if image_link == "empty":
            # "empty" placeholder image: nothing to draw, height of 0
            images_heights_inches.append(0)
            continue

        # Determine image position
        top = Inches(SLIDE_SMALL_MARGIN_INCHES)
        left = Inches(SLIDE_SMALL_MARGIN_INCHES + index*column_width_inches
                      + index*COLUMN_MARGIN_INCHES)

        # Download image and add it to the slide. A broken link (including a
        # relative URL, which requests rejects) must not abort the whole
        # presentation, so the image is skipped instead.
        try:
            image_req = requests.get(image_link, timeout=REQUEST_TIMEOUT_SECONDS)
            image_req.raise_for_status()
            image_bytes = io.BytesIO(image_req.content)
            image_box = prs_slide.shapes.add_picture(image_bytes, left, top)
        except (requests.RequestException, OSError) as error:
            # NOTE(review): OSError is presumably what the imaging backend
            # raises for data that is not a readable picture - confirm
            if debug_logs:
                print("IMAGE SKIPPED:", image_link, error)
            images_heights_inches.append(0)
            continue

        # Resize image (with aspect ratio preserved) to fit it in the slide
        # if it is too large
        # NOTE(review): images are limited to the slide size, not to their
        # column width, so wide images may overlap in multi-image slides
        ratio = image_box.width.inches / image_box.height.inches
        if image_box.width.inches > max_width_inches:
            image_box.width = Inches(max_width_inches)
            image_box.height = Inches(image_box.width.inches / ratio)
        if image_box.height.inches > max_height_inches:
            image_box.height = Inches(max_height_inches)
            image_box.width = Inches(image_box.height.inches * ratio)

        if len(images_array) == 1:
            # Center image horizontally if only one image is found
            left_centered_inches = SLIDE_WIDTH_INCHES/2 - image_box.width.inches/2
            image_box.left = Inches(max(left_centered_inches, SLIDE_SMALL_MARGIN_INCHES))

            if not text_array:
                # Center image vertically if this one image is alone with no
                # text column
                top_centered_inches = SLIDE_HEIGHT_INCHES/2 - image_box.height.inches/2
                image_box.top = Inches(max(top_centered_inches, SLIDE_SMALL_MARGIN_INCHES))

        images_heights_inches.append(image_box.height.inches)
    return images_heights_inches


def _add_text_columns(prs_slide, text_array, images_array, images_heights_inches,
                      column_width_inches, column_layout):
    """Create one text box per text column and fill it with its strings."""
    max_image_height_inches = max(images_heights_inches, default=0)
    # A lone string on a slide without images is rendered as a title
    is_title = (not images_array and len(text_array) == 1
                and len(text_array[0]) == 1)

    for index, text_column in enumerate(text_array):
        # For every text column, init default position and size
        left = Inches(SLIDE_SMALL_MARGIN_INCHES)
        width = Inches(SLIDE_WIDTH_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES)
        top = Inches(SLIDE_SMALL_MARGIN_INCHES)
        height = Inches(SLIDE_HEIGHT_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES)

        if images_array:
            # Override some default values if we have images: the text goes
            # below the tallest image
            top = Inches(SLIDE_SMALL_MARGIN_INCHES + max_image_height_inches
                         + HEIGHT_MARGIN_INCHES)
            height = Inches(_remaining_height_inches(max_image_height_inches))

        if column_layout:
            # Column layout gets the final override if enabled: the text goes
            # below the image of its own column
            left = Inches(SLIDE_SMALL_MARGIN_INCHES + index*column_width_inches
                          + index*COLUMN_MARGIN_INCHES)
            width = Inches(column_width_inches)
            top = Inches(SLIDE_SMALL_MARGIN_INCHES + images_heights_inches[index]
                         + HEIGHT_MARGIN_INCHES)
            height = Inches(_remaining_height_inches(images_heights_inches[index]))

        # Create the text box
        text_box = prs_slide.shapes.add_textbox(left, top, width, height)

        if debug_slides:
            # Fill the shape with red in debug mode
            fill = text_box.fill
            fill.solid()
            fill.fore_color.rgb = RGBColor(255, 0, 0)

        # Create the text frame inside the text box and configure it
        text_frame = text_box.text_frame
        text_frame.clear()
        if column_layout:
            text_frame.vertical_anchor = MSO_VERTICAL_ANCHOR.TOP
        else:
            text_frame.vertical_anchor = MSO_VERTICAL_ANCHOR.MIDDLE
        text_frame.word_wrap = True
        text_frame.auto_size = MSO_AUTO_SIZE.TEXT_TO_FIT_SHAPE

        # Add every string in the column to the text frame
        for text in text_column:
            if debug_logs:
                print(text)

            # Reuse the first paragraph while it is still empty, then add a
            # new paragraph for every following string
            paragraph = text_frame.paragraphs[0]
            if paragraph.text != "":
                paragraph = text_frame.add_paragraph()
            if is_title:
                # title format
                paragraph.alignment = PP_PARAGRAPH_ALIGNMENT.CENTER
                paragraph.font.size = Pt(TITLE_FONT_PT)
            paragraph.text = text


def fill_slide(prs, slide):
    """Add one slide built from *slide* to the presentation *prs*.

    Nothing is added when the slide data holds neither an image nor text.

    :param prs: presentation the slide is appended to
    :param slide: list of slide data strings (see :func:`parse_tag_contents`)
    """
    # Determine number of images and max string length
    image_count = 0
    max_chars_in_strings = 0
    for slide_data in slide:
        if IMAGE_PATTERN.search(slide_data):
            image_count += 1
        else:
            max_chars_in_strings = max(max_chars_in_strings, len(slide_data))

    # The slide is empty when there are no images and the max string length
    # is 0, which means no images and no text
    if image_count == 0 and max_chars_in_strings == 0:
        return

    # Add a slide when not empty
    prs_slide = prs.slides.add_slide(prs.slide_layouts[SLIDE_BLANK_LAYOUT])

    # We have a column layout if we have more than 1 image, at least 1 text
    # block and if the longest text is at most SHORT_TEXT_LIMIT_CHARS chars
    with_multiple_images = image_count > 1
    with_short_texts = 0 < max_chars_in_strings <= SHORT_TEXT_LIMIT_CHARS
    column_layout = with_multiple_images and with_short_texts

    # Parse slide and separate images from text
    images_array, text_array = _split_slide_content(slide, column_layout)

    # Determine available space in slide and column layout
    available_slide_width_inches = SLIDE_WIDTH_INCHES - 2*SLIDE_SMALL_MARGIN_INCHES \
        - (len(images_array)-1)*COLUMN_MARGIN_INCHES
    column_width_inches = available_slide_width_inches
    if images_array:
        column_width_inches = available_slide_width_inches/len(images_array)

    images_heights_inches = _add_images(
        prs_slide, images_array, text_array, column_width_inches)
    _add_text_columns(
        prs_slide, text_array, images_array, images_heights_inches,
        column_width_inches, column_layout)


class Html2pptx(http.server.BaseHTTPRequestHandler):
    """HTTP handler serving the form page (GET) and the converted PPTX (POST)."""

    def do_GET(self):
        """Send the ``index.html`` form page."""
        # Read the page first so that a missing file does not happen after a
        # "200 OK" status line has already been sent
        with open("index.html", 'rb') as page:
            body = page.read()

        # Set headers
        self.send_response(200)
        self.send_header("Content-type", "text/html")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()

        # Send index.html page
        self.wfile.write(body)

    def do_POST(self):
        """Convert the submitted URL and answer with the PPTX file.

        Expects the url-encoded form fields ``url`` and ``selector``.
        Answers 400 when the request is unusable and 502 when the remote page
        cannot be downloaded.
        """
        # Retrieve and decode POST query data
        try:
            content_length = int(self.headers.get('Content-Length') or 0)
        except ValueError:
            content_length = -1
        if content_length < 0:
            self.send_error(400, explain="Invalid Content-Length header")
            return
        post_data = self.rfile.read(content_length)
        decoded_post_data = urllib.parse.parse_qs(post_data.decode("utf-8"))

        # parse_qs leaves out blank fields, so missing and empty values are
        # handled the same way
        url = decoded_post_data.get("url", [""])[0]
        selector = decoded_post_data.get("selector", [""])[0]
        if not url or not selector:
            self.send_error(400, explain="Both 'url' and 'selector' fields are required")
            return
        if debug_logs:
            print("decoded_post_data[\"url\"][0]", url)
            print("decoded_post_data[\"selector\"][0]", selector)

        # Translate HTML to PPTX, retrieves presentation bytes stream.
        # ValueError is tested first: it covers an unmatched selector as well
        # as the invalid-URL errors of requests, which are ValueError too.
        # The details go in "explain" (sent in the body), never in the
        # status line.
        try:
            prs_bytes_stream = html_to_pptx(url, selector)
        except ValueError as error:
            self.send_error(400, explain=str(error))
            return
        except requests.RequestException as error:
            self.send_error(502, explain="Could not download the page: {}".format(error))
            return

        # Use getvalue() instead of read() with BytesIO: the stream position
        # is at the end after saving, so read() would return nothing
        # Source:
        # https://stackoverflow.com/questions/46981529/why-does-saving-a-presentation-to-a-file-like-object-produce-a-blank-presentatio?noredirect=1&lq=1
        payload = prs_bytes_stream.getvalue()

        # Set headers to download the PPTX file
        self.send_response(200)
        self.send_header("Content-Type", 'application/octet-stream')
        self.send_header("Content-Disposition", 'attachment; filename="presentation.pptx"')
        self.send_header("Content-Length", str(len(payload)))
        self.end_headers()

        # Send the PPTX presentation
        self.wfile.write(payload)


# Setup and start HTTP server with custom Html2pptx handler
server_address = ("", server_port)
httpd = http.server.HTTPServer(server_address, Html2pptx)
print("Serving at port:", server_port)
httpd.serve_forever()