├── .github └── workflows │ └── release.yml ├── .gitignore ├── LICENSE ├── Package.swift ├── README.md ├── Sources └── ocr.swift ├── images ├── handwriting.json ├── handwriting.webp ├── handwriting_boxes.png └── macos-vision-ocr.jpg └── output ├── handwriting.json ├── handwriting.webp.json ├── macos-vision-ocr.jpg.json └── merged_output.txt /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Build 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | platform: 7 | description: 'Build Platform' 8 | required: true 9 | default: 'all' 10 | type: choice 11 | options: 12 | - all 13 | - arm64 14 | - x86_64 15 | version: 16 | description: 'Version (e.g. v1.0.0)' 17 | required: true 18 | type: string 19 | default: 'v1.0.0' 20 | 21 | jobs: 22 | build-and-release: 23 | runs-on: macos-latest 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v3 28 | 29 | - name: Set up Swift 30 | uses: swift-actions/setup-swift@v1 31 | with: 32 | swift-version: "5.9" 33 | 34 | - name: Build for arm64 35 | if: github.event.inputs.platform == 'arm64' || github.event.inputs.platform == 'all' 36 | run: | 37 | swift build -c release --arch arm64 38 | mv .build/release/macos-vision-ocr .build/release/macos-vision-ocr-arm64 39 | zip -j macos-vision-ocr-arm64-${{ github.event.inputs.version }}.zip .build/release/macos-vision-ocr-arm64 40 | 41 | - name: Build for x86_64 42 | if: github.event.inputs.platform == 'x86_64' || github.event.inputs.platform == 'all' 43 | run: | 44 | swift build -c release --arch x86_64 45 | mv .build/release/macos-vision-ocr .build/release/macos-vision-ocr-x86_64 46 | zip -j macos-vision-ocr-x86_64-${{ github.event.inputs.version }}.zip .build/release/macos-vision-ocr-x86_64 47 | 48 | - name: Create Release 49 | uses: softprops/action-gh-release@v1 50 | with: 51 | tag_name: ${{ github.event.inputs.version }} 52 | name: Release ${{ github.event.inputs.version }} 53 | draft: false 54 | prerelease: false 55 | files: | 56 | macos-vision-ocr-*-${{ github.event.inputs.version }}.zip -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | *.log 3 | dist 4 | .cache 5 | playground 6 | .idea 7 | .DS_Store 8 | .eslintcache 9 | .build 10 | .vscode 11 | Package.resolved -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2021 EGOIST (https://github.com/sponsors/egoist) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.9 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "macos-vision-ocr", 8 | platforms: [ 9 | .macOS(.v10_15) 10 | ], 11 | dependencies: [ 12 | .package(url: "https://github.com/apple/swift-argument-parser", exact: "1.2.3") 13 | ], 14 | targets: [ 15 | // Targets are the basic building blocks of a package, defining a module or a test suite. 16 | // Targets can depend on other targets in this package and products from dependencies. 17 | .executableTarget( 18 | name: "macos-vision-ocr", 19 | dependencies: [ 20 | .product(name: "ArgumentParser", package: "swift-argument-parser") 21 | ] 22 | ), 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MacOS Vision OCR 2 | 3 | A powerful command-line OCR tool built with Apple's Vision framework, supporting single image and batch processing with detailed positional information output. 4 | 5 | ## Features 6 | 7 | - Support for multiple image formats (JPG, JPEG, PNG, WEBP) 8 | - Single image and batch processing modes 9 | - Multi-language recognition (supporting 16 languages including English, Chinese, Japanese, Korean, and European languages) 10 | - Detailed JSON output with text positions and confidence scores 11 | - Debug mode with visual bounding boxes 12 | - Support for both arm64 and x86_64 architectures 13 | 14 | ## System Requirements 15 | 16 | - macOS 10.15 or later 17 | - Support for arm64 (Apple Silicon) or x86_64 (Intel) architecture 18 | 19 | > It is recommended that macOS 13 or later be used in preference to macOS 13 or later for the best OCR recognition. 20 | 21 | ## Installation 22 | 23 | ### Build from Source 24 | 25 | 1. Ensure Xcode and Command Line Tools are installed 26 | 27 | 2. Clone the repository: 28 | 29 | ```bash 30 | git clone https://github.com/your-username/macos-vision-ocr.git 31 | cd macos-vision-ocr 32 | ``` 33 | 34 | 3. Build for your architecture: 35 | 36 | For Apple Silicon (arm64): 37 | 38 | ```bash 39 | swift build -c release --arch arm64 40 | ``` 41 | 42 | For Intel (x86_64): 43 | 44 | ```bash 45 | swift build -c release --arch x86_64 46 | ``` 47 | 48 | ## Usage 49 | 50 | ### Single Image Processing 51 | 52 | Process a single image and output to console: 53 | 54 | ```bash 55 | ./macos-vision-ocr --img ./images/handwriting.webp 56 | ``` 57 | 58 | Process with custom output directory: 59 | 60 | ```bash 61 | ./macos-vision-ocr --img ./images/handwriting.webp --output ./images 62 | ``` 63 | 64 | ### Set Recognition Languages 65 | 66 | Recognition languages can be specified using the `--rec-langs` option. For example: 67 | 68 | ```bash 69 | ./macos-vision-ocr --img ./images/handwriting.webp --rec-langs "zh-Hans, zh-Hant, en-US" 70 | ``` 71 | 72 | ### Batch Processing 73 | 74 | Process multiple images in a directory: 75 | 76 | ```bash 77 | ./macos-vision-ocr --img-dir ./images --output-dir ./output 78 | ``` 79 | 80 | Merge all results into a single file: 81 | 82 | ```bash 83 | ./macos-vision-ocr --img-dir ./images --output-dir ./output --merge 84 | ``` 85 | 86 | ### Debug Mode 87 | 88 | Enable debug mode to visualize text detection: 89 | 90 | ```bash 91 | ./macos-vision-ocr --img ./images/handwriting.webp --debug 92 | ``` 93 | 94 | ![handwriting_boxes.png](./images/handwriting_boxes.png) 95 | 96 | ### Command Line Options 97 | 98 | ``` 99 | Options: 100 | --img Path to a single image file 101 | --output Output directory for single image mode 102 | --img-dir Directory containing images for batch mode 103 | --output-dir Output directory for batch mode 104 | --merge Merge all text outputs into a single file in batch mode 105 | --debug Debug mode: Draw bounding boxes on the image 106 | --lang Show supported recognition languages 107 | --help Show help information 108 | ``` 109 | 110 | ## Output Format 111 | 112 | The tool outputs JSON with the following structure: 113 | 114 | ```json 115 | { 116 | "texts": "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in l1B and 90B\nsizes (text + images in / text ovt). The Llama 3.2-Vision instruction-tuned models\nare optimized for visval recognittion, iage reasoning, captioning, and answering\ngeneral qvestions about an iage. The models outperform many of the available\nopen Source and Closed multimodal models on common industry benchmarKs.", 117 | "info": { 118 | "filepath": "./images/handwriting.webp", 119 | "width": 1600, 120 | "filename": "handwriting.webp", 121 | "height": 720 122 | }, 123 | "observations": [ 124 | { 125 | "text": "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a", 126 | "confidence": 0.5, 127 | "quad": { 128 | "topLeft": { 129 | "y": 0.28333333395755611, 130 | "x": 0.09011629800287288 131 | }, 132 | "topRight": { 133 | "x": 0.87936045388666206, 134 | "y": 0.28333333395755611 135 | }, 136 | "bottomLeft": { 137 | "x": 0.09011629800287288, 138 | "y": 0.35483871098527953 139 | }, 140 | "bottomRight": { 141 | "x": 0.87936045388666206, 142 | "y": 0.35483871098527953 143 | } 144 | } 145 | } 146 | ] 147 | } 148 | ``` 149 | 150 | ## Debug Output 151 | 152 | When using `--debug`, the tool will: 153 | 154 | 1. Create a new image with "\_boxes.png" suffix 155 | 2. Draw red bounding boxes around detected text 156 | 3. Save the debug image in the same directory as the input image 157 | 158 | ## Supported Languages 159 | 160 | - English (en-US) 161 | - French (fr-FR) 162 | - Italian (it-IT) 163 | - German (de-DE) 164 | - Spanish (es-ES) 165 | - Portuguese (Brazil) (pt-BR) 166 | - Simplified Chinese (zh-Hans) 167 | - Traditional Chinese (zh-Hant) 168 | - Simplified Cantonese (yue-Hans) 169 | - Traditional Cantonese (yue-Hant) 170 | - Korean (ko-KR) 171 | - Japanese (ja-JP) 172 | - Russian (ru-RU) 173 | - Ukrainian (uk-UA) 174 | - Thai (th-TH) 175 | - Vietnamese (vi-VT) 176 | 177 | ## Node.js Integration Example 178 | 179 | Here's an example of how to use `macos-vision-ocr` in a Node.js application: 180 | 181 | ```javascript 182 | const { exec } = require("child_process"); 183 | const util = require("util"); 184 | const execPromise = util.promisify(exec); 185 | 186 | async function performOCR(imagePath, outputDir = null) { 187 | try { 188 | // Construct the command 189 | let command = `./macos-vision-ocr --img "${imagePath}"`; 190 | if (outputDir) { 191 | command += ` --output "${outputDir}"`; 192 | } 193 | 194 | // Execute the OCR command 195 | const { stdout, stderr } = await execPromise(command); 196 | 197 | if (stderr) { 198 | console.error("Error:", stderr); 199 | return null; 200 | } 201 | 202 | // Parse the JSON output 203 | console.log("stdout:", stdout); 204 | const result = JSON.parse(stdout); 205 | return result; 206 | } catch (error) { 207 | console.error("OCR processing failed:", error); 208 | return null; 209 | } 210 | } 211 | 212 | // Example usage 213 | async function example() { 214 | const result = await performOCR("./images/handwriting.webp"); 215 | if (result) { 216 | console.log("Extracted text:", result.texts); 217 | console.log("Text positions:", result.observations); 218 | } 219 | } 220 | 221 | example(); 222 | ``` 223 | 224 | ## Common Issues 225 | 226 | 1. **Image Loading Fails** 227 | 228 | - Ensure the image path is correct 229 | - Verify the image format is supported (JPG, JPEG, PNG, WEBP) 230 | - Check file permissions 231 | 232 | 2. **No Text Detected** 233 | - Ensure the image contains clear, readable text 234 | - Check if the text size is not too small (minimum text height is 1% of image height) 235 | - Verify the text language is supported 236 | 237 | ## License 238 | 239 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 240 | 241 | ## Acknowledgments 242 | 243 | Built with: 244 | 245 | - Apple Vision Framework 246 | - Swift Argument Parser 247 | - macOS Native APIs 248 | -------------------------------------------------------------------------------- /Sources/ocr.swift: -------------------------------------------------------------------------------- 1 | import Cocoa 2 | import Vision 3 | import ArgumentParser 4 | import Foundation 5 | 6 | @main 7 | struct MacOSVisionOCR: ParsableCommand { 8 | static var configuration = CommandConfiguration( 9 | commandName: "macos-vision-ocr", 10 | abstract: "Perform OCR on single image or batch of images" 11 | ) 12 | 13 | @Option(name: .long, help: "Path to a single image file") 14 | var img: String? 15 | 16 | @Option(name: .long, help: "Output directory for single image mode") 17 | var output: String? 18 | 19 | @Option(name: .long, help: "Directory containing images for batch mode") 20 | var imgDir: String? 21 | 22 | @Option(name: .long, help: "Output directory for batch mode") 23 | var outputDir: String? 24 | 25 | @Flag(name: .long, help: "Merge all text outputs into a single file in batch mode") 26 | var merge = false 27 | 28 | @Flag(name: .long, help: "Debug mode: Draw bounding boxes on the image") 29 | var debug = false 30 | 31 | @Flag(name: .long, help: "Show supported recognition languages") 32 | var lang = false 33 | 34 | @Option(name: .long, help: "Recognition languages") 35 | var recLangs: String? 36 | 37 | var revision: Int { 38 | var REVISION: Int 39 | if #available(macOS 13, *) { 40 | REVISION = VNRecognizeTextRequestRevision3 41 | } else if #available(macOS 11, *) { 42 | REVISION = VNRecognizeTextRequestRevision2 43 | } else { 44 | REVISION = VNRecognizeAnimalsRequestRevision1 45 | } 46 | return REVISION 47 | } 48 | 49 | private func isEmptyBox(_ box: VNRectangleObservation) -> Bool { 50 | let width = box.topRight.x - box.topLeft.x 51 | let height = box.topLeft.y - box.bottomLeft.y 52 | return width * height == 0 53 | } 54 | 55 | private func extractSubBounds(imageRef: CGImage, observation: VNRecognizedTextObservation, recognizedText: VNRecognizedText, positionalJson: inout [[String: Any]]) { 56 | func normalizeCoordinate(_ value: CGFloat) -> CGFloat { 57 | return max(0, min(1, value)) 58 | } 59 | 60 | let text = recognizedText.string 61 | let topLeft = observation.topLeft 62 | let topRight = observation.topRight 63 | let bottomRight = observation.bottomRight 64 | let bottomLeft = observation.bottomLeft 65 | 66 | let quad: [String: Any] = [ 67 | "topLeft": [ 68 | "x": normalizeCoordinate(topLeft.x), 69 | "y": normalizeCoordinate(1 - topLeft.y) 70 | ], 71 | "topRight": [ 72 | "x": normalizeCoordinate(topRight.x), 73 | "y": normalizeCoordinate(1 - topRight.y) 74 | ], 75 | "bottomRight": [ 76 | "x": normalizeCoordinate(bottomRight.x), 77 | "y": normalizeCoordinate(1 - bottomRight.y) 78 | ], 79 | "bottomLeft": [ 80 | "x": normalizeCoordinate(bottomLeft.x), 81 | "y": normalizeCoordinate(1 - bottomLeft.y) 82 | ] 83 | ] 84 | 85 | positionalJson.append([ 86 | "text": text, 87 | "confidence": observation.confidence, 88 | "quad": quad 89 | ]) 90 | } 91 | 92 | private func getSupportedLanguages() -> [String] { 93 | if #available(macOS 13, *) { 94 | let request = VNRecognizeTextRequest() 95 | do { 96 | return try request.supportedRecognitionLanguages() 97 | } catch { 98 | return ["zh-Hans", "zh-Hant", "en-US", "ja-JP"] 99 | } 100 | } else { 101 | return ["zh-Hans", "zh-Hant", "en-US", "ja-JP"] 102 | } 103 | } 104 | 105 | mutating func run() throws { 106 | if lang { 107 | let languages = getSupportedLanguages() 108 | print("Supported recognition languages:") 109 | languages.forEach { print("- \($0)") } 110 | return 111 | } 112 | 113 | if let img = img { 114 | try processSingleImage(img, outputDir: output) 115 | } else if let imgDir = imgDir { 116 | try processBatchImages(imgDir, outputDir: outputDir) 117 | } else { 118 | throw ValidationError("Either --img or --img-dir must be provided") 119 | } 120 | } 121 | 122 | private func processSingleImage(_ imagePath: String, outputDir: String?) throws { 123 | let jsonResult = try extractText(from: imagePath) 124 | 125 | if let outputDir = outputDir { 126 | let fileManager = FileManager.default 127 | if !fileManager.fileExists(atPath: outputDir) { 128 | try fileManager.createDirectory(atPath: outputDir, withIntermediateDirectories: true, attributes: nil) 129 | } 130 | let inputFileName = (imagePath as NSString).lastPathComponent 131 | let outputFileName = (inputFileName as NSString).deletingPathExtension + ".json" 132 | let outputPath = (outputDir as NSString).appendingPathComponent(outputFileName) 133 | try jsonResult.write(toFile: outputPath, atomically: true, encoding: .utf8) 134 | print("OCR result saved to: \(outputPath)") 135 | } else { 136 | print(jsonResult) 137 | } 138 | 139 | if debug { 140 | try drawDebugImage(imagePath: imagePath, jsonResult: jsonResult) 141 | } 142 | } 143 | 144 | private func processBatchImages(_ imgDir: String, outputDir: String?) throws { 145 | let fileManager = FileManager.default 146 | 147 | if let outputDir = outputDir { 148 | if !fileManager.fileExists(atPath: outputDir) { 149 | try fileManager.createDirectory(atPath: outputDir, withIntermediateDirectories: true, attributes: nil) 150 | } 151 | } 152 | 153 | let enumerator = fileManager.enumerator(atPath: imgDir) 154 | var imageFiles: [String] = [] 155 | 156 | while let filePath = enumerator?.nextObject() as? String { 157 | if isImageFile(filePath) { 158 | imageFiles.append(filePath) 159 | } 160 | } 161 | 162 | imageFiles.sort() 163 | var mergedText = "" 164 | 165 | for imagePath in imageFiles { 166 | let fullImagePath = (imgDir as NSString).appendingPathComponent(imagePath) 167 | let jsonResult = try extractText(from: fullImagePath) 168 | 169 | if let outputDir = outputDir { 170 | let outputPath = (outputDir as NSString).appendingPathComponent((imagePath as NSString).lastPathComponent + ".json") 171 | try jsonResult.write(toFile: outputPath, atomically: true, encoding: .utf8) 172 | } 173 | 174 | if merge { 175 | if let data = jsonResult.data(using: .utf8), 176 | let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], 177 | let text = json["texts"] as? String { 178 | mergedText += text + "\n\n" 179 | } 180 | } 181 | 182 | if debug { 183 | try drawDebugImage(imagePath: fullImagePath, jsonResult: jsonResult) 184 | } 185 | } 186 | 187 | if merge, let outputDir = outputDir { 188 | let mergedPath = (outputDir as NSString).appendingPathComponent("merged_output.txt") 189 | try mergedText.write(toFile: mergedPath, atomically: true, encoding: .utf8) 190 | } 191 | } 192 | 193 | private func isImageFile(_ filePath: String) -> Bool { 194 | let imageExtensions = ["jpg", "jpeg", "png", "webp"] 195 | return imageExtensions.contains((filePath as NSString).pathExtension.lowercased()) 196 | } 197 | 198 | private func extractText(from imagePath: String) throws -> String { 199 | guard let img = NSImage(byReferencingFile: imagePath) else { 200 | throw OCRError.imageLoadFailed(path: imagePath) 201 | } 202 | 203 | guard let cgImage = img.cgImage(forProposedRect: nil, context: nil, hints: nil) else { 204 | throw OCRError.imageConversionFailed(path: imagePath) 205 | } 206 | 207 | let request = VNRecognizeTextRequest() 208 | request.recognitionLevel = .accurate 209 | request.usesLanguageCorrection = true 210 | 211 | // Use recLangs if provided, otherwise use supported languages 212 | if let recLangs = recLangs { 213 | let languages = recLangs 214 | .components(separatedBy: ",") 215 | .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) } 216 | .filter { !$0.isEmpty } 217 | request.recognitionLanguages = languages 218 | } else { 219 | request.recognitionLanguages = getSupportedLanguages() 220 | } 221 | 222 | request.revision = revision 223 | 224 | request.minimumTextHeight = 0.01 225 | 226 | let handler = VNImageRequestHandler(cgImage: cgImage, options: [:]) 227 | try handler.perform([request]) 228 | 229 | guard let observations = request.results else { 230 | throw OCRError.noTextFound 231 | } 232 | 233 | var positionalJson: [[String: Any]] = [] 234 | var fullText: [String] = [] 235 | 236 | for observation in observations { 237 | guard let candidate = observation.topCandidates(1).first else { continue } 238 | fullText.append(candidate.string) 239 | extractSubBounds(imageRef: cgImage, observation: observation, recognizedText: candidate, positionalJson: &positionalJson) 240 | } 241 | 242 | let combinedFullText = fullText.joined(separator: "\n") 243 | 244 | let fileManager = FileManager.default 245 | let absolutePath = (fileManager.currentDirectoryPath as NSString).appendingPathComponent(imagePath) 246 | 247 | let info: [String: Any] = [ 248 | "filename": (imagePath as NSString).lastPathComponent, 249 | "filepath": absolutePath, 250 | "width": cgImage.width, 251 | "height": cgImage.height 252 | ] 253 | 254 | let result: [String: Any] = [ 255 | "info": info, 256 | "observations": positionalJson, 257 | "texts": combinedFullText 258 | ] 259 | 260 | let jsonData = try JSONSerialization.data(withJSONObject: result, options: .prettyPrinted) 261 | return String(data: jsonData, encoding: .utf8) ?? "" 262 | } 263 | 264 | private func drawDebugImage(imagePath: String, jsonResult: String) throws { 265 | guard let image = NSImage(contentsOfFile: imagePath) else { 266 | throw OCRError.imageLoadFailed(path: imagePath) 267 | } 268 | 269 | let size = image.size 270 | let imageRect = CGRect(origin: .zero, size: size) 271 | 272 | let newImage = NSImage(size: size) 273 | newImage.lockFocus() 274 | 275 | // Draw original image 276 | image.draw(in: imageRect) 277 | 278 | // Parse JSON result 279 | guard let data = jsonResult.data(using: .utf8), 280 | let json = try JSONSerialization.jsonObject(with: data) as? [String: Any], 281 | let observations = json["observations"] as? [[String: Any]] else { 282 | throw OCRError.jsonParsingFailed 283 | } 284 | 285 | // Set up drawing context 286 | NSColor.red.setStroke() 287 | let context = NSGraphicsContext.current!.cgContext 288 | context.setLineWidth(1.0) 289 | 290 | // Draw quadrilaterals 291 | for observation in observations { 292 | guard let quad = observation["quad"] as? [String: [String: CGFloat]] else { continue } 293 | 294 | let topLeft = CGPoint(x: quad["topLeft"]!["x"]! * size.width, y: (1 - quad["topLeft"]!["y"]!) * size.height) 295 | let topRight = CGPoint(x: quad["topRight"]!["x"]! * size.width, y: (1 - quad["topRight"]!["y"]!) * size.height) 296 | let bottomRight = CGPoint(x: quad["bottomRight"]!["x"]! * size.width, y: (1 - quad["bottomRight"]!["y"]!) * size.height) 297 | let bottomLeft = CGPoint(x: quad["bottomLeft"]!["x"]! * size.width, y: (1 - quad["bottomLeft"]!["y"]!) * size.height) 298 | 299 | context.beginPath() 300 | context.move(to: topLeft) 301 | context.addLine(to: topRight) 302 | context.addLine(to: bottomRight) 303 | context.addLine(to: bottomLeft) 304 | context.closePath() 305 | context.strokePath() 306 | } 307 | 308 | newImage.unlockFocus() 309 | 310 | // Save the new image 311 | let outputFileName = (imagePath as NSString).deletingPathExtension + "_boxes.png" 312 | guard let pngData = newImage.tiffRepresentation, 313 | let bitmap = NSBitmapImageRep(data: pngData), 314 | let pngData = bitmap.representation(using: .png, properties: [:]) else { 315 | throw OCRError.imageConversionFailed(path: outputFileName) 316 | } 317 | 318 | try pngData.write(to: URL(fileURLWithPath: outputFileName)) 319 | print("Debug image saved to: \(outputFileName)") 320 | } 321 | } 322 | 323 | enum OCRError: Error { 324 | case imageLoadFailed(path: String) 325 | case imageConversionFailed(path: String) 326 | case jsonParsingFailed 327 | case noTextFound 328 | } 329 | -------------------------------------------------------------------------------- /images/handwriting.json: -------------------------------------------------------------------------------- 1 | { 2 | "observations" : [ 3 | { 4 | "confidence" : 1, 5 | "quad" : { 6 | "bottomLeft" : { 7 | "x" : 0.09011629800287288, 8 | "y" : 0.35483871098527953 9 | }, 10 | "topRight" : { 11 | "y" : 0.28333333395755611, 12 | "x" : 0.87936045388666206 13 | }, 14 | "bottomRight" : { 15 | "x" : 0.87936045388666206, 16 | "y" : 0.35483871098527953 17 | }, 18 | "topLeft" : { 19 | "x" : 0.09011629800287288, 20 | "y" : 0.28333333395755611 21 | } 22 | }, 23 | "text" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a" 24 | }, 25 | { 26 | "text" : "collection of instruction-tuned image reasoning generative models in lIb and 90B", 27 | "confidence" : 1, 28 | "quad" : { 29 | "bottomLeft" : { 30 | "x" : 0.090116268965468371, 31 | "y" : 0.43225806298465586 32 | }, 33 | "topRight" : { 34 | "y" : 0.36774193719967596, 35 | "x" : 0.88372092899964794 36 | }, 37 | "topLeft" : { 38 | "x" : 0.090116268965468371, 39 | "y" : 0.36774193719967596 40 | }, 41 | "bottomRight" : { 42 | "x" : 0.88372092899964794, 43 | "y" : 0.43225806298465586 44 | } 45 | } 46 | }, 47 | { 48 | "quad" : { 49 | "topRight" : { 50 | "y" : 0.44838709880105843, 51 | "x" : 0.90988372519304139 52 | }, 53 | "bottomLeft" : { 54 | "x" : 0.090116269138353855, 55 | "y" : 0.51290322458603832 56 | }, 57 | "topLeft" : { 58 | "y" : 0.44838709880105843, 59 | "x" : 0.090116269138353855 60 | }, 61 | "bottomRight" : { 62 | "y" : 0.51290322458603832, 63 | "x" : 0.90988372519304139 64 | } 65 | }, 66 | "confidence" : 1, 67 | "text" : "sizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models" 68 | }, 69 | { 70 | "quad" : { 71 | "bottomRight" : { 72 | "y" : 0.59722222168345418, 73 | "x" : 0.87790696296000648 74 | }, 75 | "topRight" : { 76 | "y" : 0.52580645030504303, 77 | "x" : 0.87790696296000648 78 | }, 79 | "bottomLeft" : { 80 | "x" : 0.090116283028365843, 81 | "y" : 0.59722222168345418 82 | }, 83 | "topLeft" : { 84 | "x" : 0.090116283028365843, 85 | "y" : 0.52580645030504303 86 | } 87 | }, 88 | "confidence" : 1, 89 | "text" : "are optimized for visual recognition, image reasoning, captioning, and answering" 90 | }, 91 | { 92 | "quad" : { 93 | "topLeft" : { 94 | "y" : 0.61250000048202458, 95 | "x" : 0.090116291199582074 96 | }, 97 | "bottomRight" : { 98 | "x" : 0.89534883514489449, 99 | "y" : 0.67777777825980234 100 | }, 101 | "bottomLeft" : { 102 | "y" : 0.67777777825980234, 103 | "x" : 0.090116291199582074 104 | }, 105 | "topRight" : { 106 | "x" : 0.89534883514489449, 107 | "y" : 0.61250000048202458 108 | } 109 | }, 110 | "text" : "general questions about an image. The models outperform many of the available", 111 | "confidence" : 1 112 | }, 113 | { 114 | "text" : "open source and closed multimodal models on common industry benchmarks.", 115 | "confidence" : 1, 116 | "quad" : { 117 | "topRight" : { 118 | "y" : 0.69677419324689094, 119 | "x" : 0.82558139034792843 120 | }, 121 | "bottomLeft" : { 122 | "x" : 0.090116302457303524, 123 | "y" : 0.75161290360794786 124 | }, 125 | "bottomRight" : { 126 | "x" : 0.82558139034792843, 127 | "y" : 0.75161290360794786 128 | }, 129 | "topLeft" : { 130 | "y" : 0.69677419324689094, 131 | "x" : 0.090116302457303524 132 | } 133 | } 134 | } 135 | ], 136 | "texts" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in lIb and 90B\nsizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models\nare optimized for visual recognition, image reasoning, captioning, and answering\ngeneral questions about an image. The models outperform many of the available\nopen source and closed multimodal models on common industry benchmarks.", 137 | "info" : { 138 | "height" : 720, 139 | "filename" : "handwriting.webp", 140 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp", 141 | "width" : 1600 142 | } 143 | } -------------------------------------------------------------------------------- /images/handwriting.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/handwriting.webp -------------------------------------------------------------------------------- /images/handwriting_boxes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/handwriting_boxes.png -------------------------------------------------------------------------------- /images/macos-vision-ocr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bytefer/macos-vision-ocr/91a236a3193512430df7255e5c84018dc2ecbafc/images/macos-vision-ocr.jpg -------------------------------------------------------------------------------- /output/handwriting.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "filename" : "handwriting.webp", 4 | "width" : 1600, 5 | "height" : 720, 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp" 7 | }, 8 | "observations" : [ 9 | { 10 | "text" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a", 11 | "confidence" : 1, 12 | "quad" : { 13 | "topLeft" : { 14 | "y" : 0.28333333395755611, 15 | "x" : 0.09011629800287288 16 | }, 17 | "topRight" : { 18 | "x" : 0.87936045388666206, 19 | "y" : 0.28333333395755611 20 | }, 21 | "bottomRight" : { 22 | "y" : 0.35483871098527953, 23 | "x" : 0.87936045388666206 24 | }, 25 | "bottomLeft" : { 26 | "x" : 0.09011629800287288, 27 | "y" : 0.35483871098527953 28 | } 29 | } 30 | }, 31 | { 32 | "quad" : { 33 | "topRight" : { 34 | "x" : 0.88372092899964794, 35 | "y" : 0.36774193719967596 36 | }, 37 | "bottomRight" : { 38 | "y" : 0.43225806298465586, 39 | "x" : 0.88372092899964794 40 | }, 41 | "bottomLeft" : { 42 | "x" : 0.090116268965468371, 43 | "y" : 0.43225806298465586 44 | }, 45 | "topLeft" : { 46 | "y" : 0.36774193719967596, 47 | "x" : 0.090116268965468371 48 | } 49 | }, 50 | "confidence" : 1, 51 | "text" : "collection of instruction-tuned image reasoning generative models in lIb and 90B" 52 | }, 53 | { 54 | "text" : "sizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models", 55 | "quad" : { 56 | "bottomLeft" : { 57 | "x" : 0.090116269138353855, 58 | "y" : 0.51290322458603832 59 | }, 60 | "topLeft" : { 61 | "y" : 0.44838709880105843, 62 | "x" : 0.090116269138353855 63 | }, 64 | "topRight" : { 65 | "x" : 0.90988372519304139, 66 | "y" : 0.44838709880105843 67 | }, 68 | "bottomRight" : { 69 | "x" : 0.90988372519304139, 70 | "y" : 0.51290322458603832 71 | } 72 | }, 73 | "confidence" : 1 74 | }, 75 | { 76 | "quad" : { 77 | "topLeft" : { 78 | "x" : 0.090116283028365843, 79 | "y" : 0.52580645030504303 80 | }, 81 | "bottomRight" : { 82 | "y" : 0.59722222168345418, 83 | "x" : 0.87790696296000648 84 | }, 85 | "bottomLeft" : { 86 | "x" : 0.090116283028365843, 87 | "y" : 0.59722222168345418 88 | }, 89 | "topRight" : { 90 | "x" : 0.87790696296000648, 91 | "y" : 0.52580645030504303 92 | } 93 | }, 94 | "text" : "are optimized for visual recognition, image reasoning, captioning, and answering", 95 | "confidence" : 1 96 | }, 97 | { 98 | "text" : "general questions about an image. The models outperform many of the available", 99 | "confidence" : 1, 100 | "quad" : { 101 | "topRight" : { 102 | "x" : 0.89534883514489449, 103 | "y" : 0.61250000048202458 104 | }, 105 | "bottomLeft" : { 106 | "y" : 0.67777777825980234, 107 | "x" : 0.090116291199582074 108 | }, 109 | "topLeft" : { 110 | "y" : 0.61250000048202458, 111 | "x" : 0.090116291199582074 112 | }, 113 | "bottomRight" : { 114 | "x" : 0.89534883514489449, 115 | "y" : 0.67777777825980234 116 | } 117 | } 118 | }, 119 | { 120 | "text" : "open source and closed multimodal models on common industry benchmarks.", 121 | "quad" : { 122 | "bottomRight" : { 123 | "x" : 0.82558139034792843, 124 | "y" : 0.75161290360794786 125 | }, 126 | "topRight" : { 127 | "y" : 0.69677419324689094, 128 | "x" : 0.82558139034792843 129 | }, 130 | "bottomLeft" : { 131 | "x" : 0.090116302457303524, 132 | "y" : 0.75161290360794786 133 | }, 134 | "topLeft" : { 135 | "y" : 0.69677419324689094, 136 | "x" : 0.090116302457303524 137 | } 138 | }, 139 | "confidence" : 1 140 | } 141 | ], 142 | "texts" : "The Llama 3.2-Vision collection of multimodal large language models (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in lIb and 90B\nsizes (text + images in \/ text out). The Llama 3.2-Vision instruction-tuned models\nare optimized for visual recognition, image reasoning, captioning, and answering\ngeneral questions about an image. The models outperform many of the available\nopen source and closed multimodal models on common industry benchmarks." 143 | } -------------------------------------------------------------------------------- /output/handwriting.webp.json: -------------------------------------------------------------------------------- 1 | { 2 | "texts" : "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a\ncollection of instruction-tuned image reasoning generative models in l1B and 90B\nsizes (text + images in \/ text ovt). The Llama 3.2-Vision instruction-tuned models\nare optimized for visval recognittion, iage reasoning, captioning, and answering\ngeneral qvestions about an iage. The models outperform many of the available\nopen Source and Closed multimodal models on common industry benchmarKs.", 3 | "info" : { 4 | "height" : 720, 5 | "filename" : "handwriting.webp", 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/handwriting.webp", 7 | "width" : 1600 8 | }, 9 | "observations" : [ 10 | { 11 | "confidence" : 0.5, 12 | "text" : "The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a", 13 | "quad" : { 14 | "bottomLeft" : { 15 | "y" : 0.35483871098527953, 16 | "x" : 0.09011629800287288 17 | }, 18 | "topRight" : { 19 | "x" : 0.87936045388666206, 20 | "y" : 0.28333333395755611 21 | }, 22 | "topLeft" : { 23 | "y" : 0.28333333395755611, 24 | "x" : 0.09011629800287288 25 | }, 26 | "bottomRight" : { 27 | "x" : 0.87936045388666206, 28 | "y" : 0.35483871098527953 29 | } 30 | } 31 | }, 32 | { 33 | "text" : "collection of instruction-tuned image reasoning generative models in l1B and 90B", 34 | "confidence" : 0.5, 35 | "quad" : { 36 | "bottomLeft" : { 37 | "x" : 0.090116268965468371, 38 | "y" : 0.43225806298465586 39 | }, 40 | "bottomRight" : { 41 | "y" : 0.43225806298465586, 42 | "x" : 0.88372092899964794 43 | }, 44 | "topLeft" : { 45 | "x" : 0.090116268965468371, 46 | "y" : 0.36774193719967596 47 | }, 48 | "topRight" : { 49 | "x" : 0.88372092899964794, 50 | "y" : 0.36774193719967596 51 | } 52 | } 53 | }, 54 | { 55 | "quad" : { 56 | "bottomRight" : { 57 | "x" : 0.90988372519304139, 58 | "y" : 0.51290322458603832 59 | }, 60 | "topRight" : { 61 | "y" : 0.44838709880105843, 62 | "x" : 0.90988372519304139 63 | }, 64 | "topLeft" : { 65 | "y" : 0.44838709880105843, 66 | "x" : 0.090116269138353855 67 | }, 68 | "bottomLeft" : { 69 | "x" : 0.090116269138353855, 70 | "y" : 0.51290322458603832 71 | } 72 | }, 73 | "text" : "sizes (text + images in \/ text ovt). The Llama 3.2-Vision instruction-tuned models", 74 | "confidence" : 0.5 75 | }, 76 | { 77 | "confidence" : 0.5, 78 | "text" : "are optimized for visval recognittion, iage reasoning, captioning, and answering", 79 | "quad" : { 80 | "topLeft" : { 81 | "x" : 0.090116283028365843, 82 | "y" : 0.52580645030504303 83 | }, 84 | "topRight" : { 85 | "x" : 0.87790696296000648, 86 | "y" : 0.52580645030504303 87 | }, 88 | "bottomLeft" : { 89 | "x" : 0.090116283028365843, 90 | "y" : 0.59722222168345418 91 | }, 92 | "bottomRight" : { 93 | "y" : 0.59722222168345418, 94 | "x" : 0.87790696296000648 95 | } 96 | } 97 | }, 98 | { 99 | "confidence" : 0.5, 100 | "text" : "general qvestions about an iage. The models outperform many of the available", 101 | "quad" : { 102 | "topLeft" : { 103 | "x" : 0.090116291199582074, 104 | "y" : 0.61250000048202458 105 | }, 106 | "bottomRight" : { 107 | "x" : 0.89534883514489449, 108 | "y" : 0.67777777825980234 109 | }, 110 | "topRight" : { 111 | "y" : 0.61250000048202458, 112 | "x" : 0.89534883514489449 113 | }, 114 | "bottomLeft" : { 115 | "y" : 0.67777777825980234, 116 | "x" : 0.090116291199582074 117 | } 118 | } 119 | }, 120 | { 121 | "confidence" : 0.5, 122 | "quad" : { 123 | "bottomLeft" : { 124 | "y" : 0.75161290360794786, 125 | "x" : 0.090116302457303524 126 | }, 127 | "topLeft" : { 128 | "y" : 0.69677419324689094, 129 | "x" : 0.090116302457303524 130 | }, 131 | "topRight" : { 132 | "x" : 0.82558139034792843, 133 | "y" : 0.69677419324689094 134 | }, 135 | "bottomRight" : { 136 | "y" : 0.75161290360794786, 137 | "x" : 0.82558139034792843 138 | } 139 | }, 140 | "text" : "open Source and Closed multimodal models on common industry benchmarKs." 141 | } 142 | ] 143 | } -------------------------------------------------------------------------------- /output/macos-vision-ocr.jpg.json: -------------------------------------------------------------------------------- 1 | { 2 | "texts" : "MacOS Vision OCR\nA powerful command-line OCR tool built with Apple's Vision framework, supporting single\nimage and batch processing with detailed positional information output.\nFeatures\n• Support for multiple image formats (JPG, JPEG, PNG, WEBP)\n|• Single image and batch processing modes\n• Multi-language recognition (Simplified Chinese, Traditional Chinese, English,\nJapanese)\n• Detailed JSON output with text positions and confidence scores\n• Debug mode with visual bounding boxes\n• Support for both arm64 and x86_64 architectures", 3 | "info" : { 4 | "width" : 1782, 5 | "filename" : "macos-vision-ocr.jpg", 6 | "filepath" : "\/Users\/abao\/Documents\/Github\/macos-vision-ocr\/.\/images\/macos-vision-ocr.jpg", 7 | "height" : 970 8 | }, 9 | "observations" : [ 10 | { 11 | "quad" : { 12 | "bottomRight" : { 13 | "x" : 0.40406976816030932, 14 | "y" : 0.090721649676073324 15 | }, 16 | "topLeft" : { 17 | "x" : 0.010174420916166525, 18 | "y" : 0.021333332843206487 19 | }, 20 | "bottomLeft" : { 21 | "y" : 0.090721649676073324, 22 | "x" : 0.010174420916166525 23 | }, 24 | "topRight" : { 25 | "x" : 0.40406976816030932, 26 | "y" : 0.021333332843206487 27 | } 28 | }, 29 | "confidence" : 0.5, 30 | "text" : "MacOS Vision OCR" 31 | }, 32 | { 33 | "quad" : { 34 | "bottomRight" : { 35 | "y" : 0.24536082440269502, 36 | "x" : 0.94476745096436332 37 | }, 38 | "topLeft" : { 39 | "y" : 0.19175257698001458, 40 | "x" : 0.0087209234057915377 41 | }, 42 | "bottomLeft" : { 43 | "x" : 0.0087209234057915377, 44 | "y" : 0.24536082440269502 45 | }, 46 | "topRight" : { 47 | "x" : 0.94476745096436332, 48 | "y" : 0.19175257698001458 49 | } 50 | }, 51 | "text" : "A powerful command-line OCR tool built with Apple's Vision framework, supporting single", 52 | "confidence" : 0.5 53 | }, 54 | { 55 | "confidence" : 0.5, 56 | "text" : "image and batch processing with detailed positional information output.", 57 | "quad" : { 58 | "topRight" : { 59 | "x" : 0.75872090677282367, 60 | "y" : 0.26082474276662926 61 | }, 62 | "bottomRight" : { 63 | "x" : 0.75872090677282367, 64 | "y" : 0.30666666572327284 65 | }, 66 | "bottomLeft" : { 67 | "y" : 0.30666666572327284, 68 | "x" : 0.0058139618899673484 69 | }, 70 | "topLeft" : { 71 | "x" : 0.0058139618899673484, 72 | "y" : 0.26082474276662926 73 | } 74 | } 75 | }, 76 | { 77 | "confidence" : 1, 78 | "quad" : { 79 | "topRight" : { 80 | "y" : 0.39999999921860963, 81 | "x" : 0.14680232577338936 82 | }, 83 | "bottomRight" : { 84 | "x" : 0.14680232577338936, 85 | "y" : 0.45066666672424749 86 | }, 87 | "topLeft" : { 88 | "x" : 0.008720928234085731, 89 | "y" : 0.39999999921860963 90 | }, 91 | "bottomLeft" : { 92 | "x" : 0.008720928234085731, 93 | "y" : 0.45066666672424749 94 | } 95 | }, 96 | "text" : "Features" 97 | }, 98 | { 99 | "confidence" : 0.5, 100 | "text" : "• Support for multiple image formats (JPG, JPEG, PNG, WEBP)", 101 | "quad" : { 102 | "bottomRight" : { 103 | "x" : 0.7093023117874665, 104 | "y" : 0.59733333293108926 105 | }, 106 | "bottomLeft" : { 107 | "x" : 0.050872103125324253, 108 | "y" : 0.59733333293108926 109 | }, 110 | "topRight" : { 111 | "x" : 0.7093023117874665, 112 | "y" : 0.54399999933557741 113 | }, 114 | "topLeft" : { 115 | "x" : 0.050872103125324253, 116 | "y" : 0.54399999933557741 117 | } 118 | } 119 | }, 120 | { 121 | "text" : "|• Single image and batch processing modes", 122 | "confidence" : 0.5, 123 | "quad" : { 124 | "bottomLeft" : { 125 | "x" : 0.021802331830222881, 126 | "y" : 0.65599999999726333 127 | }, 128 | "topLeft" : { 129 | "x" : 0.021802331830222881, 130 | "y" : 0.60515463948713533 131 | }, 132 | "bottomRight" : { 133 | "x" : 0.5174418460040795, 134 | "y" : 0.65599999999726333 135 | }, 136 | "topRight" : { 137 | "y" : 0.60515463948713533, 138 | "x" : 0.5174418460040795 139 | } 140 | } 141 | }, 142 | { 143 | "text" : "• Multi-language recognition (Simplified Chinese, Traditional Chinese, English,", 144 | "quad" : { 145 | "topLeft" : { 146 | "x" : 0.049418620281314572, 147 | "y" : 0.67731958853215035 148 | }, 149 | "bottomLeft" : { 150 | "x" : 0.049418620281314572, 151 | "y" : 0.7309278359548308 152 | }, 153 | "topRight" : { 154 | "x" : 0.87209299587317068, 155 | "y" : 0.67731958853215035 156 | }, 157 | "bottomRight" : { 158 | "x" : 0.87209299587317068, 159 | "y" : 0.7309278359548308 160 | } 161 | }, 162 | "confidence" : 0.5 163 | }, 164 | { 165 | "text" : "Japanese)", 166 | "confidence" : 1, 167 | "quad" : { 168 | "topLeft" : { 169 | "y" : 0.74594523907006915, 170 | "x" : 0.072753072485975179 171 | }, 172 | "topRight" : { 173 | "y" : 0.74738910395144698, 174 | "x" : 0.18321579209608452 175 | }, 176 | "bottomRight" : { 177 | "y" : 0.78738809443310875, 178 | "x" : 0.18306087942362423 179 | }, 180 | "bottomLeft" : { 181 | "y" : 0.78594422955173093, 182 | "x" : 0.072598159813514904 183 | } 184 | } 185 | }, 186 | { 187 | "quad" : { 188 | "bottomLeft" : { 189 | "x" : 0.050872088433827421, 190 | "y" : 0.85360824835628779 191 | }, 192 | "topRight" : { 193 | "y" : 0.81030927928412277, 194 | "x" : 0.74273257425997075 195 | }, 196 | "bottomRight" : { 197 | "x" : 0.74273257425997075, 198 | "y" : 0.85360824835628779 199 | }, 200 | "topLeft" : { 201 | "x" : 0.050872088433827421, 202 | "y" : 0.81030927928412277 203 | } 204 | }, 205 | "confidence" : 0.5, 206 | "text" : "• Detailed JSON output with text positions and confidence scores" 207 | }, 208 | { 209 | "text" : "• Debug mode with visual bounding boxes", 210 | "confidence" : 0.5, 211 | "quad" : { 212 | "topLeft" : { 213 | "y" : 0.87731958773997709, 214 | "x" : 0.049418605301475543 215 | }, 216 | "bottomRight" : { 217 | "x" : 0.49709301514426102, 218 | "y" : 0.92268041248224508 219 | }, 220 | "bottomLeft" : { 221 | "y" : 0.92268041248224508, 222 | "x" : 0.049418605301475543 223 | }, 224 | "topRight" : { 225 | "x" : 0.49709301514426102, 226 | "y" : 0.87731958773997709 227 | } 228 | } 229 | }, 230 | { 231 | "text" : "• Support for both arm64 and x86_64 architectures", 232 | "quad" : { 233 | "topLeft" : { 234 | "x" : 0.049418607200380195, 235 | "y" : 0.94400000135845674 236 | }, 237 | "bottomLeft" : { 238 | "x" : 0.049418607200380195, 239 | "y" : 0.99199999884154311 240 | }, 241 | "bottomRight" : { 242 | "y" : 0.99199999884154311, 243 | "x" : 0.59738371153145142 244 | }, 245 | "topRight" : { 246 | "y" : 0.94400000135845674, 247 | "x" : 0.59738371153145142 248 | } 249 | }, 250 | "confidence" : 0.5 251 | } 252 | ] 253 | } -------------------------------------------------------------------------------- /output/merged_output.txt: -------------------------------------------------------------------------------- 1 | The Llama 3.2-Vision Collection of multimodal large langyage model5 (LLMS) is a 2 | collection of instruction-tuned image reasoning generative models in l1B and 90B 3 | sizes (text + images in / text ovt). The Llama 3.2-Vision instruction-tuned models 4 | are optimized for visval recognittion, iage reasoning, captioning, and answering 5 | general qvestions about an iage. The models outperform many of the available 6 | open Source and Closed multimodal models on common industry benchmarKs. 7 | 8 | MacOS Vision OCR 9 | A powerful command-line OCR tool built with Apple's Vision framework, supporting single 10 | image and batch processing with detailed positional information output. 11 | Features 12 | • Support for multiple image formats (JPG, JPEG, PNG, WEBP) 13 | |• Single image and batch processing modes 14 | • Multi-language recognition (Simplified Chinese, Traditional Chinese, English, 15 | Japanese) 16 | • Detailed JSON output with text positions and confidence scores 17 | • Debug mode with visual bounding boxes 18 | • Support for both arm64 and x86_64 architectures 19 | 20 | --------------------------------------------------------------------------------