├── .gitignore
├── main.py
├── src
│   ├── url_discovery.py
│   ├── pdf_merger.py
│   ├── utils.py
│   └── pdf_generator.py
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
Apple-HIGs/
__pycache__/

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
from src.url_discovery import get_article_urls
from src.pdf_generator import generate_pdfs
from src.pdf_merger import merge_pdfs

def main():
    articles = get_article_urls()
    print(f"Found {len(articles)} articles")

    print("Starting PDF generation for articles...")
    output_folder, generated_files, sections_info = generate_pdfs(articles)

    print("\nStarting PDF merge process...")
    final_pdf = merge_pdfs(output_folder, generated_files, sections_info)

    if final_pdf:
        print(f'\n✅ Successfully generated and merged PDFs into: {final_pdf}')
    else:
        print('\n❌ Failed to merge PDFs')

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
/src/url_discovery.py:
--------------------------------------------------------------------------------
from playwright.sync_api import sync_playwright
from urllib.parse import urljoin, urlparse

def get_article_urls():
    """Get unique article URLs, removing duplicates"""
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()

        try:
            page.goto(
                "https://developer.apple.com/design/human-interface-guidelines/",
                wait_until="networkidle",
                timeout=60000
            )

            if not page.title().startswith("Human Interface Guidelines"):
                raise Exception("Failed to load HIG main page")

            links = page.query_selector_all('a[href*="/design/human-interface-guidelines/"]')
            print(f"Initial links found: {len(links)}")

            base_url = "https://developer.apple.com"
            articles = set()
            seen_paths = set()

            for link in links:
                href = link.get_attribute("href")
                full_url = urljoin(base_url, href)
                parsed = urlparse(full_url)
                path = parsed.path.rstrip('/')

                if "/design/human-interface-guidelines/" in full_url and path not in seen_paths:
                    seen_paths.add(path)
                    articles.add(full_url)
                    print(f"Added unique URL: {full_url}")

            print(f"Found {len(articles)} unique articles after deduplication")
            return sorted(articles)

        finally:
            browser.close()

--------------------------------------------------------------------------------
/src/pdf_merger.py:
--------------------------------------------------------------------------------
from PyPDF2 import PdfMerger, PdfReader
import os

def merge_pdfs(output_dir, generated_files, sections_info):
    """Merge PDFs with working bookmarks and internal links"""
    merged_output = "Apple HIGs Complete.pdf"

    if not generated_files:
        print("No PDFs found to merge.")
        return None

    try:
        merger = PdfMerger()

        # Page counts used when offsetting bookmark targets below;
        # default to 0 so a missing cover/index can't leave them unbound.
        cover_pages = 0
        index_pages = 0

        # Add cover page
        if os.path.exists(generated_files[0]):
            cover_pages = len(PdfReader(generated_files[0]).pages)
            merger.append(generated_files[0])

        # Add index page
        if os.path.exists(generated_files[1]):
            index_pages = len(PdfReader(generated_files[1]).pages)
            merger.append(generated_files[1])

        # Add content pages with bookmarks using provided page numbers
        for idx, pdf_path in enumerate(generated_files[2:], 1):
            if os.path.exists(pdf_path):
                print(f"Adding: {os.path.basename(pdf_path)}")
                try:
                    section_title, page_number = sections_info[idx - 1]

                    merger.append(pdf_path)

                    # Add a bookmark pointing at the first page of this section.
                    # page_number already counts the cover (the generator starts
                    # numbering after it), so only the index length is added;
                    # the -1 converts to PyPDF2's 0-indexed page numbers.
                    merger.add_outline_item(
                        title=section_title,
                        page_number=page_number + index_pages - 1,
                    )
                except Exception as e:
                    print(f"Error adding {pdf_path}: {str(e)}")
                    continue

        # Write final merged PDF
        merged_path = os.path.join(output_dir, merged_output)
        merger.write(merged_path)
        merger.close()

        # Clean up individual PDFs
        for pdf in generated_files:
            try:
                if os.path.exists(pdf):
                    os.remove(pdf)
            except Exception as e:
                print(f"Error removing {pdf}: {str(e)}")

        return merged_path

    except Exception as e:
        print(f"Error during PDF merge: {str(e)}")
        return None

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Apple Human Interface Guidelines - PDF Generator

This tool automatically scrapes and compiles Apple's Human Interface Guidelines into a comprehensive PDF document, complete with a cover page, table of contents, and bookmarks.

## 🚨 Important Notice

This tool is for **personal use only**. The Apple Human Interface Guidelines are copyrighted material owned by Apple Inc. This script merely facilitates access to publicly available content for personal reference. The generated PDF should not be redistributed or used for commercial purposes.

## 🤔 Why Use This Tool

Having Apple's Human Interface Guidelines available as a single, offline PDF provides several benefits:

- **Offline Access**: Access the complete HIG documentation during flights, commutes, or in areas with limited internet connectivity
- **Persistent Reference**: Maintain access to a specific version of the guidelines even if the online documentation changes
- **Improved Navigation**: Quickly search across the entire documentation using PDF reader search functions
- **Annotation**: Add personal notes, highlights, and bookmarks directly on the document

## ✨ Features

- Automatically discovers all HIG articles from Apple's developer website
- Generates individual PDFs for each article with proper formatting
- Creates a professional cover page and table of contents
- Merges all PDFs with working bookmarks and internal navigation
- Handles pagination for images and special sections
- Detects and removes duplicate content
- Produces a single, well-structured PDF document

## 📋 Requirements

- Python 3.7+
- Playwright
- PyPDF2

## 🛠️ Installation

1. Clone this repository:
   ```
   git clone
   cd HIGs-PDF
   ```

2. Install required dependencies:
   ```
   pip install playwright pypdf2
   playwright install chromium
   ```

## 🚀 Usage

Run the main script:

```
python main.py
```

The script will:

1. Discover all HIG articles from Apple's developer website
2. Generate individual PDFs for each article
3. Create a cover page and table of contents
4. Merge everything into a single PDF
5. Save the final PDF as "Apple HIGs Complete.pdf" in the "Apple-HIGs" directory

## ⚠️ Potential Issues and Solutions

### Network and Web Scraping Issues

- **Rate limiting**: The script might be blocked if too many requests are made too quickly. Solution: Add a delay between requests or use proxies, as sketched below.
- **Website structure changes**: If Apple updates its website structure, the URL discovery might break. Solution: Update the selectors in `src/url_discovery.py`.
- **Timeout errors**: Some pages might take too long to load. Solution: Increase the timeout values in the code.
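One way to add a delay is to wrap `page.goto()` in a small retry helper. This is a sketch, not part of the repository; `polite_goto` and its parameters are hypothetical names, and the delay values are illustrative:

```python
import random
import time

def polite_goto(page, url, retries=3, base_delay=2.0):
    """Hypothetical helper: navigate with a pause and simple backoff retries."""
    for attempt in range(1, retries + 1):
        try:
            # Small randomized pause before each request to avoid bursts
            time.sleep(base_delay + random.uniform(0, 1))
            page.goto(url, wait_until="networkidle", timeout=60000)
            return
        except Exception:
            if attempt == retries:
                raise
            # Exponential backoff before the next retry
            time.sleep(base_delay * 2 ** attempt)
```

Calling `polite_goto(page, url)` in place of the direct `page.goto(url, ...)` call in `src/pdf_generator.py` spaces requests out, at the cost of a longer total run.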
### PDF Generation Issues

- **Missing images**: Sometimes images might not load properly. Solution: Increase the wait time for images in `src/pdf_generator.py`, as sketched below.
- **Rendering inconsistencies**: Different browsers might render content differently. Solution: Adjust the viewport settings or the CSS modifications applied before printing.
- **Memory issues**: Processing many large PDFs can consume significant memory. Solution: Process in smaller batches or increase available memory.
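The generator currently waits only for an `img` element to be *attached*. A stricter check waits until every image has actually finished loading. A minimal sketch, assuming waiting on `document.images` is acceptable for these pages; `wait_until_images_loaded` is a hypothetical name:

```python
def wait_until_images_loaded(page, timeout_ms=15000):
    """Hypothetical helper: block until every <img> on the page has loaded."""
    page.wait_for_function(
        "() => Array.from(document.images).every(img => img.complete)",
        timeout=timeout_ms,
    )
```

This could replace the `page.wait_for_selector('img', state='attached', timeout=5000)` call, trading a longer wait for fewer blank images in the output.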
### PDF Merging Issues

- **Bookmark errors**: Incorrect page numbers in bookmarks. Solution: Check the page counting logic in `src/pdf_merger.py`.
- **Large file size**: The final PDF might be very large. Solution: Adjust the PDF compression settings, for example by recompressing the merged file as sketched below.
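PyPDF2 itself exposes little compression control, but page content streams can be recompressed losslessly after merging. A minimal sketch, assuming the maintained `pypdf` package (PyPDF2's successor) is installed; `compress_pdf` is a hypothetical name:

```python
from pypdf import PdfReader, PdfWriter

def compress_pdf(src_path, dst_path):
    """Hypothetical helper: rewrite a PDF with recompressed content streams."""
    reader = PdfReader(src_path)
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
        # Losslessly re-deflate this page's content stream
        writer.pages[-1].compress_content_streams()
    with open(dst_path, "wb") as fh:
        writer.write(fh)
```

Images usually dominate the file size, so gains from this are modest; rendering at a smaller viewport in `src/pdf_generator.py` tends to have a larger effect.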
## 🔍 Project Structure

- `main.py` - The entry point script
- `src/url_discovery.py` - Discovers HIG article URLs
- `src/pdf_generator.py` - Converts articles to PDFs
- `src/pdf_merger.py` - Merges PDFs with bookmarks
- `src/utils.py` - Utility functions for the project

## 📝 License

This project is for personal use only. The Apple Human Interface Guidelines content is copyrighted by Apple Inc.

## 🙏 Acknowledgements

This tool is not affiliated with, authorized, maintained, sponsored, or endorsed by Apple Inc. or any of its affiliates or subsidiaries.

--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
import os
import re
import time
import hashlib
from datetime import datetime
from PyPDF2 import PdfReader

def sanitize_filename(text):
    """Create safe filenames from titles"""
    return re.sub(r'[\\/*?:"<>|]', '', text)[:100].strip()

def get_unique_filename(output_dir, base_name):
    """Create a unique filename using a timestamp and counter if needed."""
    base, ext = os.path.splitext(base_name)
    timestamp = int(time.time() * 1000)
    counter = 0

    while True:
        if counter == 0:
            filename = f"{base}_{timestamp}{ext}"
        else:
            filename = f"{base}_{timestamp}_{counter}{ext}"

        filepath = os.path.join(output_dir, filename)
        if not os.path.exists(filepath):
            return filepath
        counter += 1

def calculate_content_hash(page):
    """Calculate hash of page content for duplicate detection"""
    content = page.evaluate("""() => {
        const main = document.querySelector('main') || document.body;
        const clone = main.cloneNode(true);
        const dynamics = clone.querySelectorAll('[data-dynamic], .timestamp, time');
        dynamics.forEach(el => el.remove());
        return clone.textContent;
    }""")
    return hashlib.md5(content.encode()).hexdigest()

def get_pdf_page_count(pdf_path):
    """Get the number of pages in a PDF file"""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            return len(reader.pages)
    except Exception as e:
        print(f"Error getting page count for {pdf_path}: {str(e)}")
        return 0
def create_index_html(sections_info):
    """Create index page HTML with Apple-style design"""
    items = '\n'.join([
        f'<div class="toc-item"><a class="toc-link" href="#section_{idx}">• {title}</a>'
        f'<span class="toc-page">{page}</span></div>'
        for idx, (title, page) in enumerate(sections_info, 1)
    ])

    return f"""
    <html>
    <head>
        <style>
            body {{ font-family: -apple-system, 'Helvetica Neue', sans-serif; margin: 2em; color: #1d1d1f; }}
            h1 {{ font-size: 28px; font-weight: 600; margin-bottom: 1em; }}
            .toc-item {{ display: flex; justify-content: space-between; padding: 0.35em 0; }}
            .toc-link {{ color: #1d1d1f; text-decoration: none; }}
            .toc-page {{ color: #6e6e73; }}
        </style>
    </head>
    <body>
        <h1>Contents</h1>
        {items}
    </body>
    </html>
    """

def create_cover_html():
    """Create a minimalist cover page"""
    return f"""
    <html>
    <head>
        <style>
            body {{
                font-family: -apple-system, 'Helvetica Neue', sans-serif;
                display: flex;
                align-items: center;
                justify-content: center;
                height: 100vh;
                margin: 0;
                color: #1d1d1f;
            }}
            .cover {{ text-align: center; }}
            h1 {{ font-size: 40px; font-weight: 600; margin-bottom: 0.5em; }}
            p {{ font-size: 18px; color: #6e6e73; }}
        </style>
    </head>
    <body>
        <div class="cover">
            <h1>Human Interface Guidelines</h1>
            <p>A comprehensive guide for designing<br>intuitive user experiences</p>
            <p>{datetime.now().strftime('%B %Y')}</p>
        </div>
    </body>
    </html>
    """

--------------------------------------------------------------------------------
/src/pdf_generator.py:
--------------------------------------------------------------------------------
from playwright.sync_api import sync_playwright
import urllib.parse
import os
from src.utils import (
    sanitize_filename, get_unique_filename, calculate_content_hash,
    create_index_html, get_pdf_page_count, create_cover_html
)

def add_page_break_script():
    """JavaScript to handle image pagination and section breaks"""
    return """
    // Handle images and their captions
    function wrapImageWithCaption(img) {
        const wrapper = document.createElement('div');
        wrapper.style.pageBreakInside = 'avoid';
        wrapper.style.breakInside = 'avoid';
        wrapper.style.margin = '1em 0';
        wrapper.style.display = 'flex';
        wrapper.style.flexDirection = 'column';

        // Find related caption
        let caption = img.closest('figure')?.querySelector('figcaption') ||
            (img.nextElementSibling?.matches('.caption, [class*="caption"], p[class*="caption"]')
                ? img.nextElementSibling : null) ||
            img.closest('dt')?.nextElementSibling;

        // Get the container that holds both image and caption
        const container = img.closest('figure') || img.parentElement;

        if (container) {
            container.style.pageBreakInside = 'avoid';
            container.style.breakInside = 'avoid';

            if (!container.parentElement?.hasAttribute('data-image-wrapper')) {
                wrapper.setAttribute('data-image-wrapper', 'true');
                container.parentNode.insertBefore(wrapper, container);
                wrapper.appendChild(container);
            }
        } else {
            wrapper.setAttribute('data-image-wrapper', 'true');
            img.parentNode.insertBefore(wrapper, img);
            wrapper.appendChild(img);
            if (caption) wrapper.appendChild(caption);
        }
    }

    // Handle Resources and Change Log sections
    function handleSpecialSections() {
        const headings = Array.from(document.querySelectorAll('h1, h2, h3, h4, h5, h6'));

        const resourcesHeading = headings.find(h =>
            h.textContent.toLowerCase().includes('resource') ||
            h.textContent.toLowerCase().includes('related') ||
            h.textContent.toLowerCase().includes('see also')
        );

        if (resourcesHeading) {
            // Create resources wrapper with page break
            const wrapper = document.createElement('div');
            wrapper.style.pageBreakBefore = 'always';
            wrapper.style.breakBefore = 'page';

            // Get all content until next major heading or end
            const content = [];
            let current = resourcesHeading;

            while (current) {
                const next = current.nextElementSibling;
                if (next && next.matches('h1')) break;
                content.push(current);
                current = next;
            }

            // Move content to wrapper
            resourcesHeading.parentNode.insertBefore(wrapper, resourcesHeading);
            content.forEach(node => wrapper.appendChild(node));
        }
    }

    // Process images first
    document.querySelectorAll('img, [role="img"], svg').forEach(wrapImageWithCaption);
    document.querySelectorAll('.graphics-container, [class*="figure"], [class*="image"]').forEach(container => {
        container.style.pageBreakInside = 'avoid';
        container.style.breakInside = 'avoid';
    });

    // Then handle special sections
    handleSpecialSections();
    """

def generate_pdfs(article_urls):
    """Generate PDFs for all articles"""
    output_dir = 'Apple-HIGs'
    os.makedirs(output_dir, exist_ok=True)
    generated_files = []
    titles = []
    page_numbers = []
    content_hashes = set()
    current_page = 1

    with sync_playwright() as p:
        browser = p.chromium.launch()
        context = browser.new_context(
            viewport={'width': 1200, 'height': 800},
            forced_colors='none'
        )

        # Generate cover page
        cover_page = context.new_page()
        cover_html = create_cover_html()
        cover_page.set_content(cover_html)
        cover_file = os.path.join(output_dir, "_cover.pdf")
        cover_page.pdf(
            path=cover_file,
            format='A4',
            print_background=True,
            margin={'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'},
        )
        cover_page.close()

        # Store cover file but don't add to generated_files yet
        current_page += get_pdf_page_count(cover_file)

        # Generate PDFs for articles
        for idx, url in enumerate(article_urls, 1):
            page = None
            try:
                page = context.new_page()
                page.goto(url, wait_until='networkidle', timeout=60000)

                content_hash = calculate_content_hash(page)
                if content_hash in content_hashes:
                    print(f"Skipping duplicate content: {url}")
                    continue

                content_hashes.add(content_hash)

                try:
                    page.evaluate(add_page_break_script())
                except Exception as e:
                    print(f"Warning: Could not apply image pagination for {url}: {str(e)}")

                try:
                    page.wait_for_selector('img', state='attached', timeout=5000)
                except Exception:
                    print(f"Warning: No images found or timeout waiting for images in {url}")

                title_element = page.query_selector('h1')
                title = title_element.inner_text() if title_element else 'Untitled'
                titles.append(title)

                path_parts = [p for p in urllib.parse.urlparse(url).path.split('/') if p]
                section = path_parts[-2] if len(path_parts) > 1 else 'misc'
                safe_title = sanitize_filename(f"{section}-{title}")
                filepath = get_unique_filename(output_dir, f"{safe_title}.pdf")

                pdf_options = {
                    'path': filepath,
                    'format': 'A4',
                    'print_background': True,
                    'margin': {'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'},
                    'display_header_footer': False
                }

                page.pdf(**pdf_options)

                page_count = get_pdf_page_count(filepath)
                page_numbers.append(current_page)
                current_page += page_count

                generated_files.append(filepath)
                print(f'Generated ({idx}/{len(article_urls)}): {os.path.basename(filepath)} - {page_count} pages')

            except Exception as e:
                print(f'Failed {url}: {str(e)}')
            finally:
                # Guard against new_page() itself failing, which would leave
                # `page` unset and raise a NameError here.
                if page:
                    page.close()

        # Create index
        sections_info = list(zip(titles, page_numbers))
        index_html = create_index_html(sections_info)
        index_file = os.path.join(output_dir, "_index.pdf")

        index_page = context.new_page()
        index_page.set_content(index_html)

        pdf_options = {
            'path': index_file,
            'format': 'A4',
            'print_background': True,
            'margin': {'top': '1cm', 'bottom': '1cm', 'left': '1cm', 'right': '1cm'}
        }

        index_page.pdf(**pdf_options)
        index_page.close()

        # Add files in the correct order: cover, index, content
        generated_files = [cover_file, index_file] + generated_files
        browser.close()

    # Return a tuple directly instead of a list of tuples
    return (output_dir, generated_files, sections_info)
--------------------------------------------------------------------------------