├── .gitignore
├── main.py
├── src
│   ├── url_discovery.py
│   ├── pdf_merger.py
│   ├── utils.py
│   └── pdf_generator.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
Apple-HIGs/
__pycache__/

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from src.url_discovery import get_article_urls
from src.pdf_generator import generate_pdfs
from src.pdf_merger import merge_pdfs

def main():
    articles = get_article_urls()
    print(f"Found {len(articles)} articles")

    print("Starting PDF generation for articles...")
    output_folder, generated_files, sections_info = generate_pdfs(articles)

    print("\nStarting PDF merge process...")
    final_pdf = merge_pdfs(output_folder, generated_files, sections_info)

    if final_pdf:
        print(f'\n✅ Successfully generated and merged PDFs into: {final_pdf}')
    else:
        print('\n❌ Failed to merge PDFs')

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/url_discovery.py:
--------------------------------------------------------------------------------
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin, urlparse

def get_article_urls():
    """Get unique article URLs, removing duplicates"""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        try:
            page.goto(
                "https://developer.apple.com/design/human-interface-guidelines/",
                wait_until="networkidle",
                timeout=60000
            )

            if not page.title().startswith("Human Interface Guidelines"):
                raise Exception("Failed to load HIG main page")

            links = page.query_selector_all('a[href*="/design/human-interface-guidelines/"]')
            print(f"Initial links found: {len(links)}")

            base_url = "https://developer.apple.com"
            articles = set()
            seen_paths = set()

            for link in links:
                href = link.get_attribute("href")
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)
                path = parsed.path.rstrip('/')

                if "/design/human-interface-guidelines/" in full_url and path not in seen_paths:
                    seen_paths.add(path)
                    articles.add(full_url)
                    print(f"Added unique URL: {full_url}")

            print(f"Found {len(articles)} unique articles after deduplication")
            return sorted(articles)

        finally:
            browser.close()

--------------------------------------------------------------------------------
/src/pdf_merger.py:
--------------------------------------------------------------------------------
from PyPDF2 import PdfMerger, PdfReader
import os

def merge_pdfs(output_dir, generated_files, sections_info):
    """Merge PDFs with working bookmarks and internal links"""
    merged_output = "Apple HIGs Complete.pdf"

    if not generated_files:
        print("No PDFs found to merge.")
        return None

    try:
        merger = PdfMerger()

        # Page counts used when offsetting bookmark targets below;
        # default to 0 so a missing cover/index can't leave them unbound.
        cover_pages = 0
        index_pages = 0

        # Add cover page
        if os.path.exists(generated_files[0]):
            cover_pages = len(PdfReader(generated_files[0]).pages)
            merger.append(generated_files[0])

        # Add index page
        if os.path.exists(generated_files[1]):
            index_pages = len(PdfReader(generated_files[1]).pages)
            merger.append(generated_files[1])

        # Add content pages with bookmarks using provided page numbers
        for idx, pdf_path in enumerate(generated_files[2:], 1):
            if os.path.exists(pdf_path):
                print(f"Adding: {os.path.basename(pdf_path)}")
                try:
                    section_title, page_number = sections_info[idx - 1]

                    merger.append(pdf_path)

                    # Add a bookmark pointing at the first page of this section.
                    # page_number already counts the cover (the generator starts
                    # numbering after it), so only the index length is added;
                    # the -1 converts to PyPDF2's 0-indexed page numbers.
                    merger.add_outline_item(
                        title=section_title,
                        page_number=page_number + index_pages - 1,
                    )
                except Exception as e:
                    print(f"Error adding {pdf_path}: {str(e)}")
                    continue

        # Write final merged PDF
        merged_path = os.path.join(output_dir, merged_output)
        merger.write(merged_path)
        merger.close()

        # Clean up individual PDFs
        for pdf in generated_files:
            try:
                if os.path.exists(pdf):
                    os.remove(pdf)
            except Exception as e:
                print(f"Error removing {pdf}: {str(e)}")

        return merged_path

    except Exception as e:
        print(f"Error during PDF merge: {str(e)}")
        return None

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Apple Human Interface Guidelines - PDF Generator

This tool automatically scrapes and compiles Apple's Human Interface Guidelines into a comprehensive PDF document, complete with a cover page, table of contents, and bookmarks.

## 🚨 Important Notice

This tool is for **personal use only**. The Apple Human Interface Guidelines are copyrighted material owned by Apple Inc. This script merely facilitates access to publicly available content for personal reference. The generated PDF should not be redistributed or used for commercial purposes.

## 🤔 Why Use This Tool

Having Apple's Human Interface Guidelines available as a single, offline PDF provides several benefits:

- **Offline Access**: Access the complete HIG documentation during flights, commutes, or in areas with limited internet connectivity
- **Persistent Reference**: Maintain access to a specific version of the guidelines even if the online documentation changes
- **Improved Navigation**: Quickly search across the entire documentation using PDF reader search functions
- **Annotation**: Add personal notes, highlights, and bookmarks directly on the document

## ✨ Features

- Automatically discovers all HIG articles from Apple's developer website
- Generates individual PDFs for each article with proper formatting
- Creates a professional cover page and table of contents
- Merges all PDFs with working bookmarks and internal navigation
- Handles pagination for images and special sections
- Detects and removes duplicate content
- Produces a single, well-structured PDF document

## 📋 Requirements

- Python 3.7+
- Playwright
- PyPDF2

## 🛠️ Installation

1. Clone this repository:
   ```
   git clone
   cd HIGs-PDF
   ```

2. Install required dependencies:
   ```
   pip install playwright pypdf2
   playwright install chromium
   ```

## 🚀 Usage

Run the main script:

```
python main.py
```

The script will:

1. Discover all HIG articles from Apple's developer website
2. Generate individual PDFs for each article
3. Create a cover page and table of contents
4. Merge everything into a single PDF
5. Save the final PDF as "Apple HIGs Complete.pdf" in the "Apple-HIGs" directory

## ⚠️ Potential Issues and Solutions

### Network and Web Scraping Issues

- **Rate limiting**: The script might be blocked if too many requests are made too quickly. Solution: Add a delay between requests or use proxies, as sketched below.
- **Website structure changes**: If Apple updates its website structure, the URL discovery might break. Solution: Update the selectors in `src/url_discovery.py`.
- **Timeout errors**: Some pages might take too long to load. Solution: Increase the timeout values in the code.
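One way to add a delay is to wrap `page.goto()` in a small retry helper. This is a sketch, not part of the repository; `polite_goto` and its parameters are hypothetical names, and the delay values are illustrative:

```python
import random
import time

def polite_goto(page, url, retries=3, base_delay=2.0):
    """Hypothetical helper: navigate with a pause and simple backoff retries."""
    for attempt in range(1, retries + 1):
        try:
            # Small randomized pause before each request to avoid bursts
            time.sleep(base_delay + random.uniform(0, 1))
            page.goto(url, wait_until="networkidle", timeout=60000)
            return
        except Exception:
            if attempt == retries:
                raise
            # Exponential backoff before the next retry
            time.sleep(base_delay * 2 ** attempt)
```

Calling `polite_goto(page, url)` in place of the direct `page.goto(url, ...)` call in `src/pdf_generator.py` spaces requests out, at the cost of a longer total run.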
### PDF Generation Issues

- **Missing images**: Sometimes images might not load properly. Solution: Increase the wait time for images in `src/pdf_generator.py`, as sketched below.
- **Rendering inconsistencies**: Different browsers might render content differently. Solution: Adjust the viewport settings or the CSS modifications applied before printing.
- **Memory issues**: Processing many large PDFs can consume significant memory. Solution: Process in smaller batches or increase available memory.
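The generator currently waits only for an `img` element to be *attached*. A stricter check waits until every image has actually finished loading. A minimal sketch, assuming waiting on `document.images` is acceptable for these pages; `wait_until_images_loaded` is a hypothetical name:

```python
def wait_until_images_loaded(page, timeout_ms=15000):
    """Hypothetical helper: block until every <img> on the page has loaded."""
    page.wait_for_function(
        "() => Array.from(document.images).every(img => img.complete)",
        timeout=timeout_ms,
    )
```

This could replace the `page.wait_for_selector('img', state='attached', timeout=5000)` call, trading a longer wait for fewer blank images in the output.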
### PDF Merging Issues

- **Bookmark errors**: Incorrect page numbers in bookmarks. Solution: Check the page counting logic in `src/pdf_merger.py`.
- **Large file size**: The final PDF might be very large. Solution: Adjust the PDF compression settings, for example by recompressing the merged file as sketched below.
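PyPDF2 itself exposes little compression control, but page content streams can be recompressed losslessly after merging. A minimal sketch, assuming the maintained `pypdf` package (PyPDF2's successor) is installed; `compress_pdf` is a hypothetical name:

```python
from pypdf import PdfReader, PdfWriter

def compress_pdf(src_path, dst_path):
    """Hypothetical helper: rewrite a PDF with recompressed content streams."""
    reader = PdfReader(src_path)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
        # Losslessly re-deflate this page's content stream
        writer.pages[-1].compress_content_streams()
    with open(dst_path, "wb") as fh:
        writer.write(fh)
```

Images usually dominate the file size, so gains from this are modest; rendering at a smaller viewport in `src/pdf_generator.py` tends to have a larger effect.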
## 🔍 Project Structure

- `main.py` - The entry point script
- `src/url_discovery.py` - Discovers HIG article URLs
- `src/pdf_generator.py` - Converts articles to PDFs
- `src/pdf_merger.py` - Merges PDFs with bookmarks
- `src/utils.py` - Utility functions for the project

## 📝 License

This project is for personal use only. The Apple Human Interface Guidelines content is copyrighted by Apple Inc.

## 🙏 Acknowledgements

This tool is not affiliated with, authorized, maintained, sponsored, or endorsed by Apple Inc. or any of its affiliates or subsidiaries.

--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
import os
import re
import time
import hashlib
from datetime import datetime
from PyPDF2 import PdfReader

def sanitize_filename(text):
    """Create safe filenames from titles"""
    return re.sub(r'[\\/*?:"<>|]', '', text)[:100].strip()

def get_unique_filename(output_dir, base_name):
    """Create a unique filename using a timestamp and counter if needed."""
    base, ext = os.path.splitext(base_name)
    timestamp = int(time.time() * 1000)
    counter = 0

    while True:
        if counter == 0:
            filename = f"{base}_{timestamp}{ext}"
        else:
            filename = f"{base}_{timestamp}_{counter}{ext}"

        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            return filepath
        counter += 1

def calculate_content_hash(page):
    """Calculate hash of page content for duplicate detection"""
    content = page.evaluate("""() => {
        const main = document.querySelector('main') || document.body;
        const clone = main.cloneNode(true);
        const dynamics = clone.querySelectorAll('[data-dynamic], .timestamp, time');
        dynamics.forEach(el => el.remove());
        return clone.textContent;
    }""")
    return hashlib.md5(content.encode()).hexdigest()

def get_pdf_page_count(pdf_path):
    """Get the number of pages in a PDF file"""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            return len(reader.pages)
    except Exception as e:
        print(f"Error getting page count for {pdf_path}: {str(e)}")
        return 0
def create_index_html(sections_info):
    """Create index page HTML with Apple-style design"""
    items = '\n'.join([
        f'<div class="toc-item"><a class="toc-link" href="#section_{idx}">• {title}</a>'
        f'<span class="toc-page">{page}</span></div>'
        for idx, (title, page) in enumerate(sections_info, 1)
    ])

    return f"""
    <html>
    <head>
        <style>
            body {{ font-family: -apple-system, 'Helvetica Neue', sans-serif; margin: 2em; color: #1d1d1f; }}
            h1 {{ font-size: 28px; font-weight: 600; margin-bottom: 1em; }}
            .toc-item {{ display: flex; justify-content: space-between; padding: 0.35em 0; }}
            .toc-link {{ color: #1d1d1f; text-decoration: none; }}
            .toc-page {{ color: #6e6e73; }}
        </style>
    </head>
    <body>
        <h1>Contents</h1>
        {items}
    </body>
    </html>
    """

def create_cover_html():
    """Create a minimalist cover page"""
    return f"""
    <html>
    <head>
        <style>
            body {{
                font-family: -apple-system, 'Helvetica Neue', sans-serif;
                display: flex;
                align-items: center;
                justify-content: center;
                height: 100vh;
                margin: 0;
                color: #1d1d1f;
            }}
            .cover {{ text-align: center; }}
            h1 {{ font-size: 40px; font-weight: 600; margin-bottom: 0.5em; }}
            p {{ font-size: 18px; color: #6e6e73; }}
        </style>
    </head>
    <body>
        <div class="cover">
            <h1>Human Interface Guidelines</h1>
            <p>A comprehensive guide for designing<br>intuitive user experiences</p>
            <p>{datetime.now().strftime('%B %Y')}</p>
        </div>
    </body>
    </html>
    """

--------------------------------------------------------------------------------
/src/pdf_generator.py:
--------------------------------------------------------------------------------
from playwright.sync_api import sync_playwright
import urllib.parse
import os
from src.utils import (
    sanitize_filename, get_unique_filename, calculate_content_hash,
    create_index_html, get_pdf_page_count, create_cover_html
)

def add_page_break_script():
    """JavaScript to handle image pagination and section breaks"""
    return """
    // Handle images and their captions
    function wrapImageWithCaption(img) {
        const wrapper = document.createElement('div');
        wrapper.style.pageBreakInside = 'avoid';
        wrapper.style.breakInside = 'avoid';
        wrapper.style.margin = '1em 0';
        wrapper.style.display = 'flex';
        wrapper.style.flexDirection = 'column';

        // Find related caption
        let caption = img.closest('figure')?.querySelector('figcaption') ||
            (img.nextElementSibling?.matches('.caption, [class*="caption"], p[class*="caption"]')
                ? img.nextElementSibling : null) ||
            img.closest('dt')?.nextElementSibling;

        // Get the container that holds both image and caption
        const container = img.closest('figure') || img.parentElement;

        if (container) {
            container.style.pageBreakInside = 'avoid';
            container.style.breakInside = 'avoid';

            if (!container.parentElement?.hasAttribute('data-image-wrapper')) {
                wrapper.setAttribute('data-image-wrapper', 'true');
                container.parentNode.insertBefore(wrapper, container);
                wrapper.appendChild(container);
            }
        } else {
            wrapper.setAttribute('data-image-wrapper', 'true');
            img.parentNode.insertBefore(wrapper, img);
            wrapper.appendChild(img);
            if (caption) wrapper.appendChild(caption);
        }
    }

    // Handle Resources and Change Log sections
    function handleSpecialSections() {
        const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'));

        const resourcesHeading = headings.find(h =>
            h.textContent.toLowerCase().includes('resource') ||
            h.textContent.toLowerCase().includes('related') ||
            h.textContent.toLowerCase().includes('see also')
        );

        if (resourcesHeading) {
            // Create resources wrapper with page break
            const wrapper = document.createElement('div');
            wrapper.style.pageBreakBefore = 'always';
            wrapper.style.breakBefore = 'page';

            // Get all content until next major heading or end
            const content = [];
            let current = resourcesHeading;

            while (current) {
                const next = current.nextElementSibling;
                if (next && next.matches('h1')) break;
                content.push(current);
                current = next;
            }

            // Move content to wrapper
            resourcesHeading.parentNode.insertBefore(wrapper, resourcesHeading);
            content.forEach(node => wrapper.appendChild(node));
        }
    }

    // Process images first
    document.querySelectorAll('img, [role="img"], svg').forEach(wrapImageWithCaption);
    document.querySelectorAll('.graphics-container, [class*="figure"], [class*="image"]').forEach(container => {
        container.style.pageBreakInside = 'avoid';
        container.style.breakInside = 'avoid';
    });

    // Then handle special sections
    handleSpecialSections();
    """

def generate_pdfs(article_urls):
    """Generate PDFs for all articles"""
    output_dir = 'Apple-HIGs'
    os.makedirs(output_dir, exist_ok=True)
    generated_files = []
    titles = []
    page_numbers = []
    content_hashes = set()
    current_page = 1

    with sync_playwright() as p:
        browser = p.chromium.launch()
        context = browser.new_context(
            viewport={'width': 1200, 'height': 800},
            forced_colors='none'
        )

        # Generate cover page
        cover_page = context.new_page()
        cover_html = create_cover_html()
        cover_page.set_content(cover_html)
        cover_file = os.path.join(output_dir, "_cover.pdf")
        cover_page.pdf(
            path=cover_file,
            format='A4',
            print_background=True,
            margin={'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'},
        )
        cover_page.close()

        # Store cover file but don't add to generated_files yet
        current_page += get_pdf_page_count(cover_file)

        # Generate PDFs for articles
        for idx, url in enumerate(article_urls, 1):
            page = None
            try:
                page = context.new_page()
                page.goto(url, wait_until='networkidle', timeout=60000)

                content_hash = calculate_content_hash(page)
                if content_hash in content_hashes:
                    print(f"Skipping duplicate content: {url}")
                    continue

                content_hashes.add(content_hash)

                try:
                    page.evaluate(add_page_break_script())
                except Exception as e:
                    print(f"Warning: Could not apply image pagination for {url}: {str(e)}")

                try:
                    page.wait_for_selector('img', state='attached', timeout=5000)
                except Exception:
                    print(f"Warning: No images found or timeout waiting for images in {url}")

                title_element = page.query_selector('h1')
                title = title_element.inner_text() if title_element else 'Untitled'
                titles.append(title)

                path_parts = [p for p in urllib.parse.urlparse(url).path.split('/') if p]
                section = path_parts[-2] if len(path_parts) > 1 else 'misc'
                safe_title = sanitize_filename(f"{section}-{title}")
                filepath = get_unique_filename(output_dir, f"{safe_title}.pdf")

                pdf_options = {
                    'path': filepath,
                    'format': 'A4',
                    'print_background': True,
                    'margin': {'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'},
                    'display_header_footer': False
                }

                page.pdf(**pdf_options)

                page_count = get_pdf_page_count(filepath)
                page_numbers.append(current_page)
                current_page += page_count

                generated_files.append(filepath)
                print(f'Generated ({idx}/{len(article_urls)}): {os.path.basename(filepath)} - {page_count} pages')

            except Exception as e:
                print(f'Failed {url}: {str(e)}')
            finally:
                # Guard against new_page() itself failing, which would leave
                # `page` unset and raise a NameError here.
                if page:
                    page.close()

        # Create index
        sections_info = list(zip(titles, page_numbers))
        index_html = create_index_html(sections_info)
        index_file = os.path.join(output_dir, "_index.pdf")

        index_page = context.new_page()
        index_page.set_content(index_html)

        pdf_options = {
            'path': index_file,
            'format': 'A4',
            'print_background': True,
            'margin': {'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'}
        }

        index_page.pdf(**pdf_options)
        index_page.close()

        # Add files in the correct order: cover, index, content
        generated_files = [cover_file, index_file] + generated_files
        browser.close()

    # Return a tuple directly instead of a list of tuples
    return (output_dir, generated_files, sections_info)
--------------------------------------------------------------------------------