├── .github └── workflows │ └── release.yml ├── .gitignore ├── LICENSE ├── Package.swift ├── README.md ├── Sources └── ocr.swift ├── images ├── handwriting.json ├── handwriting.webp ├── handwriting_boxes.png └── macos-vision-ocr.jpg └── output ├── handwriting.json ├── handwriting.webp.json ├── macos-vision-ocr.jpg.json └── merged_output.txt /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Build 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | platform: 7 | description: 'Build Platform' 8 | required: true 9 | default: 'all' 10 | type: choice 11 | options: 12 | - all 13 | - arm64 14 | - x86_64 15 | version: 16 | description: 'Version (e.g. v1.0.0)' 17 | required: true 18 | type: string 19 | default: 'v1.0.0' 20 | 21 | jobs: 22 | build-and-release: 23 | runs-on: macos-latest 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v3 28 | 29 | - name: Set up Swift 30 | uses: swift-actions/setup-swift@v1 31 | with: 32 | swift-version: "5.9" 33 | 34 | - name: Build for arm64 35 | if: github.event.inputs.platform == 'arm64' || github.event.inputs.platform == 'all' 36 | run: | 37 | swift build -c release --arch arm64 38 | mv .build/release/macos-vision-ocr .build/release/macos-vision-ocr-arm64 39 | zip -j macos-vision-ocr-arm64-${{ github.event.inputs.version }}.zip .build/release/macos-vision-ocr-arm64 40 | 41 | - name: Build for x86_64 42 | if: github.event.inputs.platform == 'x86_64' || github.event.inputs.platform == 'all' 43 | run: | 44 | swift build -c release --arch x86_64 45 | mv .build/release/macos-vision-ocr .build/release/macos-vision-ocr-x86_64 46 | zip -j macos-vision-ocr-x86_64-${{ github.event.inputs.version }}.zip .build/release/macos-vision-ocr-x86_64 47 | 48 | - name: Create Release 49 | uses: softprops/action-gh-release@v1 50 | with: 51 | tag_name: ${{ github.event.inputs.version }} 52 | name: Release ${{ github.event.inputs.version }} 53 | draft: false 54 | prerelease: false 55 | files: | 56 | macos-vision-ocr-*-${{ github.event.inputs.version }}.zip -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | *.log 3 | dist 4 | .cache 5 | playground 6 | .idea 7 | .DS_Store 8 | .eslintcache 9 | .build 10 | .vscode 11 | Package.resolved -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2021 EGOIST (https://github.com/sponsors/egoist) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 5.9
2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
3 |
4 | import PackageDescription
5 |
6 | let package = Package(
7 |     name: "macos-vision-ocr",
8 |     platforms: [
9 |         .macOS(.v10_15)
10 |     ],
11 |     dependencies: [
12 |         .package(url: "https://github.com/apple/swift-argument-parser", exact: "1.2.3")
13 |     ],
14 |     targets: [
15 |         // Targets are the basic building blocks of a package, defining a module or a test suite.
16 |         // Targets can depend on other targets in this package and products from dependencies.
17 |         .executableTarget(
18 |             name: "macos-vision-ocr",
19 |             dependencies: [
20 |                 .product(name: "ArgumentParser", package: "swift-argument-parser")
21 |             ]
22 |         ),
23 |     ]
24 | )
25 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# MacOS Vision OCR

A powerful command-line OCR tool built with Apple's Vision framework, supporting single image and batch processing with detailed positional information output.

## Features

- Support for multiple image formats (JPG, JPEG, PNG, WEBP)
- Single image and batch processing modes
- Multi-language recognition (supporting 16 languages including English, Chinese, Japanese, Korean, and European languages)
- Detailed JSON output with text positions and confidence scores
- Debug mode with visual bounding boxes
- Support for both arm64 and x86_64 architectures

## System Requirements

- macOS 10.15 or later
- Support for arm64 (Apple Silicon) or x86_64 (Intel) architecture

> macOS 13 or later is recommended for the best OCR results, as it enables the latest Vision text recognition revision.

## Installation

### Build from Source

1. Ensure Xcode and Command Line Tools are installed

2. Clone the repository:

```bash
git clone https://github.com/your-username/macos-vision-ocr.git
cd macos-vision-ocr
```

3. Build for your architecture:

For Apple Silicon (arm64):

```bash
swift build -c release --arch arm64
```

For Intel (x86_64):

```bash
swift build -c release --arch x86_64
```

## Usage

### Single Image Processing

Process a single image and output to console:

```bash
./macos-vision-ocr --img ./images/handwriting.webp
```

Process with custom output directory:

```bash
./macos-vision-ocr --img ./images/handwriting.webp --output ./images
```

### Set Recognition Languages

Recognition languages can be specified using the `--rec-langs` option. For example:

```bash
./macos-vision-ocr --img ./images/handwriting.webp --rec-langs "zh-Hans, zh-Hant, en-US"
```

### Batch Processing

Process multiple images in a directory:

```bash
./macos-vision-ocr --img-dir ./images --output-dir ./output
```

Merge all results into a single file:

```bash
./macos-vision-ocr --img-dir ./images --output-dir ./output --merge
```

### Debug Mode

Enable debug mode to visualize text detection:

```bash
./macos-vision-ocr --img ./images/handwriting.webp --debug
```

![handwriting_boxes.png](./images/handwriting_boxes.png)

### Command Line Options

```
Options:
  --img          Path to a single image file
  --output       Output directory for single image mode
  --img-dir      Directory containing images for batch mode
  --output-dir   Output directory for batch mode
  --rec-langs    Recognition languages, comma-separated (e.g. "zh-Hans, en-US")
  --merge        Merge all text outputs into a single file in batch mode
  --debug        Debug mode: Draw bounding boxes on the image
  --lang         Show supported recognition languages
  --help         Show help information
```

## Output Format

The tool outputs JSON with the following structure:

```json
{
  "texts": "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in l1B and 90B\nsizes (text + images in / text ovt). The Llama 3.2-Vision instruction-tuned models\nare optimized for visval recognittion, iage reasoning, captioning, and answering\ngeneral qvestions about an iage. The models outperform many of the available\nopen Source and Closed multimodal models on common industry benchmarKs.",
  "info": {
    "filepath": "./images/handwriting.webp",
    "width": 1600,
    "filename": "handwriting.webp",
    "height": 720
  },
  "observations": [
    {
      "text": "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a",
      "confidence": 0.5,
      "quad": {
        "topLeft": {
          "y": 0.28333333395755611,
          "x": 0.09011629800287288
        },
        "topRight": {
          "x": 0.87936045388666206,
          "y": 0.28333333395755611
        },
        "bottomLeft": {
          "x": 0.09011629800287288,
          "y": 0.35483871098527953
        },
        "bottomRight": {
          "x": 0.87936045388666206,
          "y": 0.35483871098527953
        }
      }
    }
  ]
}
```

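Each `quad` value is normalized to the image dimensions, with `y` measured from the top edge (the tool flips Vision's bottom-left origin before writing the JSON). Below is a minimal Node.js sketch of converting an observation into pixel coordinates; the result path and the `quadToPixels` helper name are only illustrative:

```javascript
const fs = require("fs");

// Load a result file written by --output / --output-dir (path is illustrative).
const result = JSON.parse(fs.readFileSync("./output/handwriting.json", "utf8"));
const { width, height } = result.info;

// quad values are normalized (0..1) with y measured from the top edge,
// so scaling by the image size yields pixel coordinates directly.
function quadToPixels(quad) {
  const toPx = ({ x, y }) => ({ x: Math.round(x * width), y: Math.round(y * height) });
  return {
    topLeft: toPx(quad.topLeft),
    topRight: toPx(quad.topRight),
    bottomRight: toPx(quad.bottomRight),
    bottomLeft: toPx(quad.bottomLeft),
  };
}

for (const obs of result.observations) {
  console.log(obs.text, quadToPixels(obs.quad));
}
```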

## Debug Output

When using `--debug`, the tool will:

1. Create a new image with "\_boxes.png" suffix
2. Draw red bounding boxes around detected text
3. Save the debug image in the same directory as the input image

## Supported Languages

- English (en-US)
- French (fr-FR)
- Italian (it-IT)
- German (de-DE)
- Spanish (es-ES)
- Portuguese (Brazil) (pt-BR)
- Simplified Chinese (zh-Hans)
- Traditional Chinese (zh-Hant)
- Simplified Cantonese (yue-Hans)
- Traditional Cantonese (yue-Hant)
- Korean (ko-KR)
- Japanese (ja-JP)
- Russian (ru-RU)
- Ukrainian (uk-UA)
- Thai (th-TH)
- Vietnamese (vi-VT)

## Node.js Integration Example

Here's an example of how to use `macos-vision-ocr` in a Node.js application:

```javascript
const { exec } = require("child_process");
const util = require("util");
const execPromise = util.promisify(exec);

async function performOCR(imagePath, outputDir = null) {
  try {
    // Construct the command
    let command = `./macos-vision-ocr --img "${imagePath}"`;
    if (outputDir) {
      command += ` --output "${outputDir}"`;
    }

    // Execute the OCR command
    const { stdout, stderr } = await execPromise(command);

    if (stderr) {
      console.error("Error:", stderr);
      return null;
    }

    // Parse the JSON output
    console.log("stdout:", stdout);
    const result = JSON.parse(stdout);
    return result;
  } catch (error) {
    console.error("OCR processing failed:", error);
    return null;
  }
}

// Example usage
async function example() {
  const result = await performOCR("./images/handwriting.webp");
  if (result) {
    console.log("Extracted text:", result.texts);
    console.log("Text positions:", result.observations);
  }
}

example();
```

## Common Issues

1. **Image Loading Fails**

   - Ensure the image path is correct
   - Verify the image format is supported (JPG, JPEG, PNG, WEBP)
   - Check file permissions

2. **No Text Detected**

   - Ensure the image contains clear, readable text
   - Check that the text is not too small (the minimum text height is 1% of the image height)
   - Verify the text language is supported

## License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## Acknowledgments

Built with:

- Apple Vision Framework
- Swift Argument Parser
- macOS Native APIs

--------------------------------------------------------------------------------
/Sources/ocr.swift:
--------------------------------------------------------------------------------
1 | import Cocoa
2 | import Vision
3 | import ArgumentParser
4 | import Foundation
5 |
6 | @main
7 | struct MacOSVisionOCR: ParsableCommand {
8 |     static var configuration = CommandConfiguration(
9 |         commandName: "macos-vision-ocr",
10 |         abstract: "Perform OCR on single image or batch of images"
11 |     )
12 |
13 |     @Option(name: .long, help: "Path to a single image file")
14 |     var img: String?
15 |
16 |     @Option(name: .long, help: "Output directory for single image mode")
17 |     var output: String?
18 |
19 |     @Option(name: .long, help: "Directory containing images for batch mode")
20 |     var imgDir: String?
21 |
22 |     @Option(name: .long, help: "Output directory for batch mode")
23 |     var outputDir: String?
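
Batch mode works the same way from Node.js. The sketch below (directory paths are placeholders) runs the tool with `--img-dir`, `--output-dir`, and `--merge`, then reads the `merged_output.txt` file written to the output directory; the `performBatchOCR` helper name is only illustrative:

```javascript
const { exec } = require("child_process");
const util = require("util");
const fs = require("fs/promises");
const path = require("path");
const execPromise = util.promisify(exec);

async function performBatchOCR(imgDir, outputDir) {
  // Run batch mode and merge all recognized text into a single file.
  const command = `./macos-vision-ocr --img-dir "${imgDir}" --output-dir "${outputDir}" --merge`;
  await execPromise(command);

  // The tool writes one <image name>.json per image plus merged_output.txt
  // into the output directory.
  return fs.readFile(path.join(outputDir, "merged_output.txt"), "utf8");
}

// Example usage
performBatchOCR("./images", "./output")
  .then((text) => console.log("Merged text:", text))
  .catch((error) => console.error("Batch OCR failed:", error));
```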
24 | 25 | @Flag(name: .long, help: "Merge all text outputs into a single file in batch mode") 26 | var merge = false 27 | 28 | @Flag(name: .long, help: "Debug mode: Draw bounding boxes on the image") 29 | var debug = false 30 | 31 | @Flag(name: .long, help: "Show supported recognition languages") 32 | var lang = false 33 | 34 | @Option(name: .long, help: "Recognition languages") 35 | var recLangs: String? 36 | 37 | var revision: Int { 38 | var REVISION: Int 39 | if #available(macOS 13, *) { 40 | REVISION = VNRecognizeTextRequestRevision3 41 | } else if #available(macOS 11, *) { 42 | REVISION = VNRecognizeTextRequestRevision2 43 | } else { 44 | REVISION = VNRecognizeAnimalsRequestRevision1 45 | } 46 | return REVISION 47 | } 48 | 49 | private func isEmptyBox(_ box: VNRectangleObservation) -> Bool { 50 | let width = box.topRight.x - box.topLeft.x 51 | let height = box.topLeft.y - box.bottomLeft.y 52 | return width * height == 0 53 | } 54 | 55 | private func extractSubBounds(imageRef: CGImage, observation: VNRecognizedTextObservation, recognizedText: VNRecognizedText, positionalJson: inout [[String: Any]]) { 56 | func normalizeCoordinate(_ value: CGFloat) -> CGFloat { 57 | return max(0, min(1, value)) 58 | } 59 | 60 | let text = recognizedText.string 61 | let topLeft = observation.topLeft 62 | let topRight = observation.topRight 63 | let bottomRight = observation.bottomRight 64 | let bottomLeft = observation.bottomLeft 65 | 66 | let quad: [String: Any] = [ 67 | "topLeft": [ 68 | "x": normalizeCoordinate(topLeft.x), 69 | "y": normalizeCoordinate(1 - topLeft.y) 70 | ], 71 | "topRight": [ 72 | "x": normalizeCoordinate(topRight.x), 73 | "y": normalizeCoordinate(1 - topRight.y) 74 | ], 75 | "bottomRight": [ 76 | "x": normalizeCoordinate(bottomRight.x), 77 | "y": normalizeCoordinate(1 - bottomRight.y) 78 | ], 79 | "bottomLeft": [ 80 | "x": normalizeCoordinate(bottomLeft.x), 81 | "y": normalizeCoordinate(1 - bottomLeft.y) 82 | ] 83 | ] 84 | 85 | positionalJson.append([ 86 | "text": text, 87 | "confidence": observation.confidence, 88 | "quad": quad 89 | ]) 90 | } 91 | 92 | private func getSupportedLanguages() -> [String] { 93 | if #available(macOS 13, *) { 94 | let request = VNRecognizeTextRequest() 95 | do { 96 | return try request.supportedRecognitionLanguages() 97 | } catch { 98 | return ["zh-Hans", "zh-Hant", "en-US", "ja-JP"] 99 | } 100 | } else { 101 | return ["zh-Hans", "zh-Hant", "en-US", "ja-JP"] 102 | } 103 | } 104 | 105 | mutating func run() throws { 106 | if lang { 107 | let languages = getSupportedLanguages() 108 | print("Supported recognition languages:") 109 | languages.forEach { print("- \($0)") } 110 | return 111 | } 112 | 113 | if let img = img { 114 | try processSingleImage(img, outputDir: output) 115 | } else if let imgDir = imgDir { 116 | try processBatchImages(imgDir, outputDir: outputDir) 117 | } else { 118 | throw ValidationError("Either --img or --img-dir must be provided") 119 | } 120 | } 121 | 122 | private func processSingleImage(_ imagePath: String, outputDir: String?) 
throws { 123 | let jsonResult = try extractText(from: imagePath) 124 | 125 | if let outputDir = outputDir { 126 | let fileManager = FileManager.default 127 | if !fileManager.fileExists(atPath: outputDir) { 128 | try fileManager.createDirectory(atPath: outputDir, withIntermediateDirectories: true, attributes: nil) 129 | } 130 | let inputFileName = (imagePath as NSString).lastPathComponent 131 | let outputFileName = (inputFileName as NSString).deletingPathExtension + ".json" 132 | let outputPath = (outputDir as NSString).appendingPathComponent(outputFileName) 133 | try jsonResult.write(toFile: outputPath, atomically: true, encoding: .utf8) 134 | print("OCR result saved to: \(outputPath)") 135 | } else { 136 | print(jsonResult) 137 | } 138 | 139 | if debug { 140 | try drawDebugImage(imagePath: imagePath, jsonResult: jsonResult) 141 | } 142 | } 143 | 144 | private func processBatchImages(_ imgDir: String, outputDir: String?) throws { 145 | let fileManager = FileManager.default 146 | 147 | if let outputDir = outputDir { 148 | if !fileManager.fileExists(atPath: outputDir) { 149 | try fileManager.createDirectory(atPath: outputDir, withIntermediateDirectories: true, attributes: nil) 150 | } 151 | } 152 | 153 | let enumerator = fileManager.enumerator(atPath: imgDir) 154 | var imageFiles: [String] = [] 155 | 156 | while let filePath = enumerator?.nextObject() as? String { 157 | if isImageFile(filePath) { 158 | imageFiles.append(filePath) 159 | } 160 | } 161 | 162 | imageFiles.sort() 163 | var mergedText = "" 164 | 165 | for imagePath in imageFiles { 166 | let fullImagePath = (imgDir as NSString).appendingPathComponent(imagePath) 167 | let jsonResult = try extractText(from: fullImagePath) 168 | 169 | if let outputDir = outputDir { 170 | let outputPath = (outputDir as NSString).appendingPathComponent((imagePath as NSString).lastPathComponent + ".json") 171 | try jsonResult.write(toFile: outputPath, atomically: true, encoding: .utf8) 172 | } 173 | 174 | if merge { 175 | if let data = jsonResult.data(using: .utf8), 176 | let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], 177 | let text = json["texts"] as? 
String { 178 | mergedText += text + "\n\n" 179 | } 180 | } 181 | 182 | if debug { 183 | try drawDebugImage(imagePath: fullImagePath, jsonResult: jsonResult) 184 | } 185 | } 186 | 187 | if merge, let outputDir = outputDir { 188 | let mergedPath = (outputDir as NSString).appendingPathComponent("merged_output.txt") 189 | try mergedText.write(toFile: mergedPath, atomically: true, encoding: .utf8) 190 | } 191 | } 192 | 193 | private func isImageFile(_ filePath: String) -> Bool { 194 | let imageExtensions = ["jpg", "jpeg", "png", "webp"] 195 | return imageExtensions.contains((filePath as NSString).pathExtension.lowercased()) 196 | } 197 | 198 | private func extractText(from imagePath: String) throws -> String { 199 | guard let img = NSImage(byReferencingFile: imagePath) else { 200 | throw OCRError.imageLoadFailed(path: imagePath) 201 | } 202 | 203 | guard let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { 204 | throw OCRError.imageConversionFailed(path: imagePath) 205 | } 206 | 207 | let request = VNRecognizeTextRequest() 208 | request.recognitionLevel = .accurate 209 | request.usesLanguageCorrection = true 210 | 211 | // Use recLangs if provided, otherwise use supported languages 212 | if let recLangs = recLangs { 213 | let languages = recLangs 214 | .components(separatedBy: ",") 215 | .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } 216 | .filter { !$0.isEmpty } 217 | request.recognitionLanguages = languages 218 | } else { 219 | request.recognitionLanguages = getSupportedLanguages() 220 | } 221 | 222 | request.revision = revision 223 | 224 | request.minimumTextHeight = 0.01 225 | 226 | let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) 227 | try handler.perform([request]) 228 | 229 | guard let observations = request.results else { 230 | throw OCRError.noTextFound 231 | } 232 | 233 | var positionalJson: [[String: Any]] = [] 234 | var fullText: [String] = [] 235 | 236 | for observation in observations { 237 | guard let candidate = observation.topCandidates(1).first else { continue } 238 | fullText.append(candidate.string) 239 | extractSubBounds(imageRef: cgImage, observation: observation, recognizedText: candidate, positionalJson: &positionalJson) 240 | } 241 | 242 | let combinedFullText = fullText.joined(separator: "\n") 243 | 244 | let fileManager = FileManager.default 245 | let absolutePath = (fileManager.currentDirectoryPath as NSString).appendingPathComponent(imagePath) 246 | 247 | let info: [String: Any] = [ 248 | "filename": (imagePath as NSString).lastPathComponent, 249 | "filepath": absolutePath, 250 | "width": cgImage.width, 251 | "height": cgImage.height 252 | ] 253 | 254 | let result: [String: Any] = [ 255 | "info": info, 256 | "observations": positionalJson, 257 | "texts": combinedFullText 258 | ] 259 | 260 | let jsonData = try JSONSerialization.data(withJSONObject: result, options: .prettyPrinted) 261 | return String(data: jsonData, encoding: .utf8) ?? 
"" 262 | } 263 | 264 | private func drawDebugImage(imagePath: String, jsonResult: String) throws { 265 | guard let image = NSImage(contentsOfFile: imagePath) else { 266 | throw OCRError.imageLoadFailed(path: imagePath) 267 | } 268 | 269 | let size = image.size 270 | let imageRect = CGRect(origin: .zero, size: size) 271 | 272 | let newImage = NSImage(size: size) 273 | newImage.lockFocus() 274 | 275 | // Draw original image 276 | image.draw(in: imageRect) 277 | 278 | // Parse JSON result 279 | guard let data = jsonResult.data(using: .utf8), 280 | let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], 281 | let observations = json["observations"] as? [[String: Any]] else { 282 | throw OCRError.jsonParsingFailed 283 | } 284 | 285 | // Set up drawing context 286 | NSColor.red.setStroke() 287 | let context = NSGraphicsContext.current!.cgContext 288 | context.setLineWidth(1.0) 289 | 290 | // Draw quadrilaterals 291 | for observation in observations { 292 | guard let quad = observation["quad"] as? [String: [String: CGFloat]] else { continue } 293 | 294 | let topLeft = CGPoint(x: quad["topLeft"]!["x"]! * size.width, y: (1 - quad["topLeft"]!["y"]!) * size.height) 295 | let topRight = CGPoint(x: quad["topRight"]!["x"]! * size.width, y: (1 - quad["topRight"]!["y"]!) * size.height) 296 | let bottomRight = CGPoint(x: quad["bottomRight"]!["x"]! * size.width, y: (1 - quad["bottomRight"]!["y"]!) * size.height) 297 | let bottomLeft = CGPoint(x: quad["bottomLeft"]!["x"]! * size.width, y: (1 - quad["bottomLeft"]!["y"]!) * size.height) 298 | 299 | context.beginPath() 300 | context.move(to: topLeft) 301 | context.addLine(to: topRight) 302 | context.addLine(to: bottomRight) 303 | context.addLine(to: bottomLeft) 304 | context.closePath() 305 | context.strokePath() 306 | } 307 | 308 | newImage.unlockFocus() 309 | 310 | // Save the new image 311 | let outputFileName = (imagePath as NSString).deletingPathExtension + "_boxes.png" 312 | guard let pngData = newImage.tiffRepresentation, 313 | let bitmap = NSBitmapImageRep(data: pngData), 314 | let pngData = bitmap.representation(using: .png, properties: [:]) else { 315 | throw OCRError.imageConversionFailed(path: outputFileName) 316 | } 317 | 318 | try pngData.write(to: URL(fileURLWithPath: outputFileName)) 319 | print("Debug image saved to: \(outputFileName)") 320 | } 321 | } 322 | 323 | enum OCRError: Error { 324 | case imageLoadFailed(path: String) 325 | case imageConversionFailed(path: String) 326 | case jsonParsingFailed 327 | case noTextFound 328 | } 329 | -------------------------------------------------------------------------------- /images/handwriting.json: -------------------------------------------------------------------------------- 1 | { 2 | "observations" : [ 3 | { 4 | "confidence" : 1, 5 | "quad" : { 6 | "bottomLeft" : { 7 | "x" : 0.09011629800287288, 8 | "y" : 0.35483871098527953 9 | }, 10 | "topRight" : { 11 | "y" : 0.28333333395755611, 12 | "x" : 0.87936045388666206 13 | }, 14 | "bottomRight" : { 15 | "x" : 0.87936045388666206, 16 | "y" : 0.35483871098527953 17 | }, 18 | "topLeft" : { 19 | "x" : 0.09011629800287288, 20 | "y" : 0.28333333395755611 21 | } 22 | }, 23 | "text" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a" 24 | }, 25 | { 26 | "text" : "collection of instruction-tuned image reasoning generative models in lIb and 90B", 27 | "confidence" : 1, 28 | "quad" : { 29 | "bottomLeft" : { 30 | "x" : 0.090116268965468371, 31 | "y" : 0.43225806298465586 32 | }, 33 | "topRight" : { 34 | "y" : 
0.36774193719967596, 35 | "x" : 0.88372092899964794 36 | }, 37 | "topLeft" : { 38 | "x" : 0.090116268965468371, 39 | "y" : 0.36774193719967596 40 | }, 41 | "bottomRight" : { 42 | "x" : 0.88372092899964794, 43 | "y" : 0.43225806298465586 44 | } 45 | } 46 | }, 47 | { 48 | "quad" : { 49 | "topRight" : { 50 | "y" : 0.44838709880105843, 51 | "x" : 0.90988372519304139 52 | }, 53 | "bottomLeft" : { 54 | "x" : 0.090116269138353855, 55 | "y" : 0.51290322458603832 56 | }, 57 | "topLeft" : { 58 | "y" : 0.44838709880105843, 59 | "x" : 0.090116269138353855 60 | }, 61 | "bottomRight" : { 62 | "y" : 0.51290322458603832, 63 | "x" : 0.90988372519304139 64 | } 65 | }, 66 | "confidence" : 1, 67 | "text" : "sizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models" 68 | }, 69 | { 70 | "quad" : { 71 | "bottomRight" : { 72 | "y" : 0.59722222168345418, 73 | "x" : 0.87790696296000648 74 | }, 75 | "topRight" : { 76 | "y" : 0.52580645030504303, 77 | "x" : 0.87790696296000648 78 | }, 79 | "bottomLeft" : { 80 | "x" : 0.090116283028365843, 81 | "y" : 0.59722222168345418 82 | }, 83 | "topLeft" : { 84 | "x" : 0.090116283028365843, 85 | "y" : 0.52580645030504303 86 | } 87 | }, 88 | "confidence" : 1, 89 | "text" : "are optimized for visual recognition, image reasoning, captioning, and answering" 90 | }, 91 | { 92 | "quad" : { 93 | "topLeft" : { 94 | "y" : 0.61250000048202458, 95 | "x" : 0.090116291199582074 96 | }, 97 | "bottomRight" : { 98 | "x" : 0.89534883514489449, 99 | "y" : 0.67777777825980234 100 | }, 101 | "bottomLeft" : { 102 | "y" : 0.67777777825980234, 103 | "x" : 0.090116291199582074 104 | }, 105 | "topRight" : { 106 | "x" : 0.89534883514489449, 107 | "y" : 0.61250000048202458 108 | } 109 | }, 110 | "text" : "general questions about an image. The models outperform many of the available", 111 | "confidence" : 1 112 | }, 113 | { 114 | "text" : "open source and closed multimodal models on common industry benchmarks.", 115 | "confidence" : 1, 116 | "quad" : { 117 | "topRight" : { 118 | "y" : 0.69677419324689094, 119 | "x" : 0.82558139034792843 120 | }, 121 | "bottomLeft" : { 122 | "x" : 0.090116302457303524, 123 | "y" : 0.75161290360794786 124 | }, 125 | "bottomRight" : { 126 | "x" : 0.82558139034792843, 127 | "y" : 0.75161290360794786 128 | }, 129 | "topLeft" : { 130 | "y" : 0.69677419324689094, 131 | "x" : 0.090116302457303524 132 | } 133 | } 134 | } 135 | ], 136 | "texts" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in lIb and 90B\nsizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models\nare optimized for visual recognition, image reasoning, captioning, and answering\ngeneral questions about an image. 
The models outperform many of the available\nopen source and closed multimodal models on common industry benchmarks.", 137 | "info" : { 138 | "height" : 720, 139 | "filename" : "handwriting.webp", 140 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp", 141 | "width" : 1600 142 | } 143 | } -------------------------------------------------------------------------------- /images/handwriting.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/handwriting.webp -------------------------------------------------------------------------------- /images/handwriting_boxes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/handwriting_boxes.png -------------------------------------------------------------------------------- /images/macos-vision-ocr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/macos-vision-ocr.jpg -------------------------------------------------------------------------------- /output/handwriting.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "filename" : "handwriting.webp", 4 | "width" : 1600, 5 | "height" : 720, 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp" 7 | }, 8 | "observations" : [ 9 | { 10 | "text" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a", 11 | "confidence" : 1, 12 | "quad" : { 13 | "topLeft" : { 14 | "y" : 0.28333333395755611, 15 | "x" : 0.09011629800287288 16 | }, 17 | "topRight" : { 18 | "x" : 0.87936045388666206, 19 | "y" : 0.28333333395755611 20 | }, 21 | "bottomRight" : { 22 | "y" : 0.35483871098527953, 23 | "x" : 0.87936045388666206 24 | }, 25 | "bottomLeft" : { 26 | "x" : 0.09011629800287288, 27 | "y" : 0.35483871098527953 28 | } 29 | } 30 | }, 31 | { 32 | "quad" : { 33 | "topRight" : { 34 | "x" : 0.88372092899964794, 35 | "y" : 0.36774193719967596 36 | }, 37 | "bottomRight" : { 38 | "y" : 0.43225806298465586, 39 | "x" : 0.88372092899964794 40 | }, 41 | "bottomLeft" : { 42 | "x" : 0.090116268965468371, 43 | "y" : 0.43225806298465586 44 | }, 45 | "topLeft" : { 46 | "y" : 0.36774193719967596, 47 | "x" : 0.090116268965468371 48 | } 49 | }, 50 | "confidence" : 1, 51 | "text" : "collection of instruction-tuned image reasoning generative models in lIb and 90B" 52 | }, 53 | { 54 | "text" : "sizes (text + images in \/ text out). 
The Llama 3.2-Vision instruction-tuned models", 55 | "quad" : { 56 | "bottomLeft" : { 57 | "x" : 0.090116269138353855, 58 | "y" : 0.51290322458603832 59 | }, 60 | "topLeft" : { 61 | "y" : 0.44838709880105843, 62 | "x" : 0.090116269138353855 63 | }, 64 | "topRight" : { 65 | "x" : 0.90988372519304139, 66 | "y" : 0.44838709880105843 67 | }, 68 | "bottomRight" : { 69 | "x" : 0.90988372519304139, 70 | "y" : 0.51290322458603832 71 | } 72 | }, 73 | "confidence" : 1 74 | }, 75 | { 76 | "quad" : { 77 | "topLeft" : { 78 | "x" : 0.090116283028365843, 79 | "y" : 0.52580645030504303 80 | }, 81 | "bottomRight" : { 82 | "y" : 0.59722222168345418, 83 | "x" : 0.87790696296000648 84 | }, 85 | "bottomLeft" : { 86 | "x" : 0.090116283028365843, 87 | "y" : 0.59722222168345418 88 | }, 89 | "topRight" : { 90 | "x" : 0.87790696296000648, 91 | "y" : 0.52580645030504303 92 | } 93 | }, 94 | "text" : "are optimized for visual recognition, image reasoning, captioning, and answering", 95 | "confidence" : 1 96 | }, 97 | { 98 | "text" : "general questions about an image. The models outperform many of the available", 99 | "confidence" : 1, 100 | "quad" : { 101 | "topRight" : { 102 | "x" : 0.89534883514489449, 103 | "y" : 0.61250000048202458 104 | }, 105 | "bottomLeft" : { 106 | "y" : 0.67777777825980234, 107 | "x" : 0.090116291199582074 108 | }, 109 | "topLeft" : { 110 | "y" : 0.61250000048202458, 111 | "x" : 0.090116291199582074 112 | }, 113 | "bottomRight" : { 114 | "x" : 0.89534883514489449, 115 | "y" : 0.67777777825980234 116 | } 117 | } 118 | }, 119 | { 120 | "text" : "open source and closed multimodal models on common industry benchmarks.", 121 | "quad" : { 122 | "bottomRight" : { 123 | "x" : 0.82558139034792843, 124 | "y" : 0.75161290360794786 125 | }, 126 | "topRight" : { 127 | "y" : 0.69677419324689094, 128 | "x" : 0.82558139034792843 129 | }, 130 | "bottomLeft" : { 131 | "x" : 0.090116302457303524, 132 | "y" : 0.75161290360794786 133 | }, 134 | "topLeft" : { 135 | "y" : 0.69677419324689094, 136 | "x" : 0.090116302457303524 137 | } 138 | }, 139 | "confidence" : 1 140 | } 141 | ], 142 | "texts" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in lIb and 90B\nsizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models\nare optimized for visual recognition, image reasoning, captioning, and answering\ngeneral questions about an image. The models outperform many of the available\nopen source and closed multimodal models on common industry benchmarks." 143 | } -------------------------------------------------------------------------------- /output/handwriting.webp.json: -------------------------------------------------------------------------------- 1 | { 2 | "texts" : "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in l1B and 90B\nsizes (text + images in \/ text ovt). The Llama 3.2-Vision instruction-tuned models\nare optimized for visval recognittion, iage reasoning, captioning, and answering\ngeneral qvestions about an iage. 
The models outperform many of the available\nopen Source and Closed multimodal models on common industry benchmarKs.", 3 | "info" : { 4 | "height" : 720, 5 | "filename" : "handwriting.webp", 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp", 7 | "width" : 1600 8 | }, 9 | "observations" : [ 10 | { 11 | "confidence" : 0.5, 12 | "text" : "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a", 13 | "quad" : { 14 | "bottomLeft" : { 15 | "y" : 0.35483871098527953, 16 | "x" : 0.09011629800287288 17 | }, 18 | "topRight" : { 19 | "x" : 0.87936045388666206, 20 | "y" : 0.28333333395755611 21 | }, 22 | "topLeft" : { 23 | "y" : 0.28333333395755611, 24 | "x" : 0.09011629800287288 25 | }, 26 | "bottomRight" : { 27 | "x" : 0.87936045388666206, 28 | "y" : 0.35483871098527953 29 | } 30 | } 31 | }, 32 | { 33 | "text" : "collection of instruction-tuned image reasoning generative models in l1B and 90B", 34 | "confidence" : 0.5, 35 | "quad" : { 36 | "bottomLeft" : { 37 | "x" : 0.090116268965468371, 38 | "y" : 0.43225806298465586 39 | }, 40 | "bottomRight" : { 41 | "y" : 0.43225806298465586, 42 | "x" : 0.88372092899964794 43 | }, 44 | "topLeft" : { 45 | "x" : 0.090116268965468371, 46 | "y" : 0.36774193719967596 47 | }, 48 | "topRight" : { 49 | "x" : 0.88372092899964794, 50 | "y" : 0.36774193719967596 51 | } 52 | } 53 | }, 54 | { 55 | "quad" : { 56 | "bottomRight" : { 57 | "x" : 0.90988372519304139, 58 | "y" : 0.51290322458603832 59 | }, 60 | "topRight" : { 61 | "y" : 0.44838709880105843, 62 | "x" : 0.90988372519304139 63 | }, 64 | "topLeft" : { 65 | "y" : 0.44838709880105843, 66 | "x" : 0.090116269138353855 67 | }, 68 | "bottomLeft" : { 69 | "x" : 0.090116269138353855, 70 | "y" : 0.51290322458603832 71 | } 72 | }, 73 | "text" : "sizes (text + images in \/ text ovt). The Llama 3.2-Vision instruction-tuned models", 74 | "confidence" : 0.5 75 | }, 76 | { 77 | "confidence" : 0.5, 78 | "text" : "are optimized for visval recognittion, iage reasoning, captioning, and answering", 79 | "quad" : { 80 | "topLeft" : { 81 | "x" : 0.090116283028365843, 82 | "y" : 0.52580645030504303 83 | }, 84 | "topRight" : { 85 | "x" : 0.87790696296000648, 86 | "y" : 0.52580645030504303 87 | }, 88 | "bottomLeft" : { 89 | "x" : 0.090116283028365843, 90 | "y" : 0.59722222168345418 91 | }, 92 | "bottomRight" : { 93 | "y" : 0.59722222168345418, 94 | "x" : 0.87790696296000648 95 | } 96 | } 97 | }, 98 | { 99 | "confidence" : 0.5, 100 | "text" : "general qvestions about an iage. The models outperform many of the available", 101 | "quad" : { 102 | "topLeft" : { 103 | "x" : 0.090116291199582074, 104 | "y" : 0.61250000048202458 105 | }, 106 | "bottomRight" : { 107 | "x" : 0.89534883514489449, 108 | "y" : 0.67777777825980234 109 | }, 110 | "topRight" : { 111 | "y" : 0.61250000048202458, 112 | "x" : 0.89534883514489449 113 | }, 114 | "bottomLeft" : { 115 | "y" : 0.67777777825980234, 116 | "x" : 0.090116291199582074 117 | } 118 | } 119 | }, 120 | { 121 | "confidence" : 0.5, 122 | "quad" : { 123 | "bottomLeft" : { 124 | "y" : 0.75161290360794786, 125 | "x" : 0.090116302457303524 126 | }, 127 | "topLeft" : { 128 | "y" : 0.69677419324689094, 129 | "x" : 0.090116302457303524 130 | }, 131 | "topRight" : { 132 | "x" : 0.82558139034792843, 133 | "y" : 0.69677419324689094 134 | }, 135 | "bottomRight" : { 136 | "y" : 0.75161290360794786, 137 | "x" : 0.82558139034792843 138 | } 139 | }, 140 | "text" : "open Source and Closed multimodal models on common industry benchmarKs." 
141 | } 142 | ] 143 | } -------------------------------------------------------------------------------- /output/macos-vision-ocr.jpg.json: -------------------------------------------------------------------------------- 1 | { 2 | "texts" : "MacOS Vision OCR\nA powerful command-line OCR tool built with Apple's Vision framework, supporting single\nimage and batch processing with detailed positional information output.\nFeatures\n• Support for multiple image formats (JPG, JPEG, PNG, WEBP)\n|• Single image and batch processing modes\n• Multi-language recognition (Simplified Chinese, Traditional Chinese, English,\nJapanese)\n• Detailed JSON output with text positions and confidence scores\n• Debug mode with visual bounding boxes\n• Support for both arm64 and x86_64 architectures", 3 | "info" : { 4 | "width" : 1782, 5 | "filename" : "macos-vision-ocr.jpg", 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/macos-vision-ocr.jpg", 7 | "height" : 970 8 | }, 9 | "observations" : [ 10 | { 11 | "quad" : { 12 | "bottomRight" : { 13 | "x" : 0.40406976816030932, 14 | "y" : 0.090721649676073324 15 | }, 16 | "topLeft" : { 17 | "x" : 0.010174420916166525, 18 | "y" : 0.021333332843206487 19 | }, 20 | "bottomLeft" : { 21 | "y" : 0.090721649676073324, 22 | "x" : 0.010174420916166525 23 | }, 24 | "topRight" : { 25 | "x" : 0.40406976816030932, 26 | "y" : 0.021333332843206487 27 | } 28 | }, 29 | "confidence" : 0.5, 30 | "text" : "MacOS Vision OCR" 31 | }, 32 | { 33 | "quad" : { 34 | "bottomRight" : { 35 | "y" : 0.24536082440269502, 36 | "x" : 0.94476745096436332 37 | }, 38 | "topLeft" : { 39 | "y" : 0.19175257698001458, 40 | "x" : 0.0087209234057915377 41 | }, 42 | "bottomLeft" : { 43 | "x" : 0.0087209234057915377, 44 | "y" : 0.24536082440269502 45 | }, 46 | "topRight" : { 47 | "x" : 0.94476745096436332, 48 | "y" : 0.19175257698001458 49 | } 50 | }, 51 | "text" : "A powerful command-line OCR tool built with Apple's Vision framework, supporting single", 52 | "confidence" : 0.5 53 | }, 54 | { 55 | "confidence" : 0.5, 56 | "text" : "image and batch processing with detailed positional information output.", 57 | "quad" : { 58 | "topRight" : { 59 | "x" : 0.75872090677282367, 60 | "y" : 0.26082474276662926 61 | }, 62 | "bottomRight" : { 63 | "x" : 0.75872090677282367, 64 | "y" : 0.30666666572327284 65 | }, 66 | "bottomLeft" : { 67 | "y" : 0.30666666572327284, 68 | "x" : 0.0058139618899673484 69 | }, 70 | "topLeft" : { 71 | "x" : 0.0058139618899673484, 72 | "y" : 0.26082474276662926 73 | } 74 | } 75 | }, 76 | { 77 | "confidence" : 1, 78 | "quad" : { 79 | "topRight" : { 80 | "y" : 0.39999999921860963, 81 | "x" : 0.14680232577338936 82 | }, 83 | "bottomRight" : { 84 | "x" : 0.14680232577338936, 85 | "y" : 0.45066666672424749 86 | }, 87 | "topLeft" : { 88 | "x" : 0.008720928234085731, 89 | "y" : 0.39999999921860963 90 | }, 91 | "bottomLeft" : { 92 | "x" : 0.008720928234085731, 93 | "y" : 0.45066666672424749 94 | } 95 | }, 96 | "text" : "Features" 97 | }, 98 | { 99 | "confidence" : 0.5, 100 | "text" : "• Support for multiple image formats (JPG, JPEG, PNG, WEBP)", 101 | "quad" : { 102 | "bottomRight" : { 103 | "x" : 0.7093023117874665, 104 | "y" : 0.59733333293108926 105 | }, 106 | "bottomLeft" : { 107 | "x" : 0.050872103125324253, 108 | "y" : 0.59733333293108926 109 | }, 110 | "topRight" : { 111 | "x" : 0.7093023117874665, 112 | "y" : 0.54399999933557741 113 | }, 114 | "topLeft" : { 115 | "x" : 0.050872103125324253, 116 | "y" : 0.54399999933557741 117 | } 118 | } 119 | }, 120 | { 121 | "text" : 
"|• Single image and batch processing modes", 122 | "confidence" : 0.5, 123 | "quad" : { 124 | "bottomLeft" : { 125 | "x" : 0.021802331830222881, 126 | "y" : 0.65599999999726333 127 | }, 128 | "topLeft" : { 129 | "x" : 0.021802331830222881, 130 | "y" : 0.60515463948713533 131 | }, 132 | "bottomRight" : { 133 | "x" : 0.5174418460040795, 134 | "y" : 0.65599999999726333 135 | }, 136 | "topRight" : { 137 | "y" : 0.60515463948713533, 138 | "x" : 0.5174418460040795 139 | } 140 | } 141 | }, 142 | { 143 | "text" : "• Multi-language recognition (Simplified Chinese, Traditional Chinese, English,", 144 | "quad" : { 145 | "topLeft" : { 146 | "x" : 0.049418620281314572, 147 | "y" : 0.67731958853215035 148 | }, 149 | "bottomLeft" : { 150 | "x" : 0.049418620281314572, 151 | "y" : 0.7309278359548308 152 | }, 153 | "topRight" : { 154 | "x" : 0.87209299587317068, 155 | "y" : 0.67731958853215035 156 | }, 157 | "bottomRight" : { 158 | "x" : 0.87209299587317068, 159 | "y" : 0.7309278359548308 160 | } 161 | }, 162 | "confidence" : 0.5 163 | }, 164 | { 165 | "text" : "Japanese)", 166 | "confidence" : 1, 167 | "quad" : { 168 | "topLeft" : { 169 | "y" : 0.74594523907006915, 170 | "x" : 0.072753072485975179 171 | }, 172 | "topRight" : { 173 | "y" : 0.74738910395144698, 174 | "x" : 0.18321579209608452 175 | }, 176 | "bottomRight" : { 177 | "y" : 0.78738809443310875, 178 | "x" : 0.18306087942362423 179 | }, 180 | "bottomLeft" : { 181 | "y" : 0.78594422955173093, 182 | "x" : 0.072598159813514904 183 | } 184 | } 185 | }, 186 | { 187 | "quad" : { 188 | "bottomLeft" : { 189 | "x" : 0.050872088433827421, 190 | "y" : 0.85360824835628779 191 | }, 192 | "topRight" : { 193 | "y" : 0.81030927928412277, 194 | "x" : 0.74273257425997075 195 | }, 196 | "bottomRight" : { 197 | "x" : 0.74273257425997075, 198 | "y" : 0.85360824835628779 199 | }, 200 | "topLeft" : { 201 | "x" : 0.050872088433827421, 202 | "y" : 0.81030927928412277 203 | } 204 | }, 205 | "confidence" : 0.5, 206 | "text" : "• Detailed JSON output with text positions and confidence scores" 207 | }, 208 | { 209 | "text" : "• Debug mode with visual bounding boxes", 210 | "confidence" : 0.5, 211 | "quad" : { 212 | "topLeft" : { 213 | "y" : 0.87731958773997709, 214 | "x" : 0.049418605301475543 215 | }, 216 | "bottomRight" : { 217 | "x" : 0.49709301514426102, 218 | "y" : 0.92268041248224508 219 | }, 220 | "bottomLeft" : { 221 | "y" : 0.92268041248224508, 222 | "x" : 0.049418605301475543 223 | }, 224 | "topRight" : { 225 | "x" : 0.49709301514426102, 226 | "y" : 0.87731958773997709 227 | } 228 | } 229 | }, 230 | { 231 | "text" : "• Support for both arm64 and x86_64 architectures", 232 | "quad" : { 233 | "topLeft" : { 234 | "x" : 0.049418607200380195, 235 | "y" : 0.94400000135845674 236 | }, 237 | "bottomLeft" : { 238 | "x" : 0.049418607200380195, 239 | "y" : 0.99199999884154311 240 | }, 241 | "bottomRight" : { 242 | "y" : 0.99199999884154311, 243 | "x" : 0.59738371153145142 244 | }, 245 | "topRight" : { 246 | "y" : 0.94400000135845674, 247 | "x" : 0.59738371153145142 248 | } 249 | }, 250 | "confidence" : 0.5 251 | } 252 | ] 253 | } -------------------------------------------------------------------------------- /output/merged_output.txt: -------------------------------------------------------------------------------- 1 | The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a 2 | collection of instruction-tuned image reasoning generative models in l1B and 90B 3 | sizes (text + images in / text ovt). 
The Llama 3.2-Vision instruction-tuned models 4 | are optimized for visval recognittion, iage reasoning, captioning, and answering 5 | general qvestions about an iage. The models outperform many of the available 6 | open Source and Closed multimodal models on common industry benchmarKs. 7 | 8 | MacOS Vision OCR 9 | A powerful command-line OCR tool built with Apple's Vision framework, supporting single 10 | image and batch processing with detailed positional information output. 11 | Features 12 | • Support for multiple image formats (JPG, JPEG, PNG, WEBP) 13 | |• Single image and batch processing modes 14 | • Multi-language recognition (Simplified Chinese, Traditional Chinese, English, 15 | Japanese) 16 | • Detailed JSON output with text positions and confidence scores 17 | • Debug mode with visual bounding boxes 18 | • Support for both arm64 and x86_64 architectures 19 | 20 | --------------------------------------------------------------------------------