├── Applications ├── AlpacaChatApp │ ├── Resources │ │ ├── .gitignore │ │ └── Assets.xcassets │ │ │ ├── Contents.json │ │ │ ├── AppIcon.appiconset │ │ │ ├── Icon.png │ │ │ └── Contents.json │ │ │ └── AccentColor.colorset │ │ │ └── Contents.json │ ├── Configurations │ │ ├── .gitignore │ │ └── AlpacaChatApp.xcconfig │ ├── Preview Content │ │ └── Preview Assets.xcassets │ │ │ └── Contents.json │ ├── Sources │ │ ├── String.swift │ │ ├── AlpacaChatApp.swift │ │ ├── Message.swift │ │ ├── ChatView.swift │ │ ├── MessageView.swift │ │ └── ChatViewModel.swift │ └── Supporting Files │ │ └── AlpacaChatApp.entitlements ├── AlpacaChatCLI │ ├── .gitignore │ ├── Package.resolved │ ├── Package.swift │ └── Sources │ │ └── AlpacaChatCLI │ │ └── Command.swift └── AlpacaChatApp.xcodeproj │ ├── project.xcworkspace │ ├── contents.xcworkspacedata │ └── xcshareddata │ │ └── IDEWorkspaceChecks.plist │ ├── xcshareddata │ └── xcschemes │ │ └── AlpacaChatApp.xcscheme │ └── project.pbxproj ├── Resources └── AlpacaChat.png ├── .gitignore ├── Sources ├── alpaca.cpp │ ├── README.md │ ├── LICENSE │ ├── include │ │ ├── chat.h │ │ ├── utils.h │ │ └── ggml.h │ ├── utils.cpp │ └── chat.cpp ├── AlpacaChat │ ├── Model.swift │ └── Chat.swift └── AlpacaChatObjC │ ├── include │ ├── ALPChatModel.h │ └── ALPChat.h │ └── ALPChat.mm ├── Package.swift ├── LICENSE └── README.md /Applications/AlpacaChatApp/Resources/.gitignore: -------------------------------------------------------------------------------- 1 | *.bin 2 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Configurations/.gitignore: -------------------------------------------------------------------------------- 1 | Local.xcconfig 2 | -------------------------------------------------------------------------------- /Resources/AlpacaChat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niw/AlpacaChat/HEAD/Resources/AlpacaChat.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .swiftpm/config/registries.json 3 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 4 | xcuserdata/ 5 | /.build 6 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Resources/Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Preview Content/Preview Assets.xcassets/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "info" : { 3 | "author" : "xcode", 4 | "version" : 1 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /Applications/AlpacaChatCLI/.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .swiftpm/config/registries.json 3 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata 4 | xcuserdata/ 5 | /.build 6 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/README.md: -------------------------------------------------------------------------------- 1 | Alpaca.cpp 2 | ========== 3 | 4 | See and 5 | 
original for details. 6 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/niw/AlpacaChat/HEAD/Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Icon.png -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/String.swift: -------------------------------------------------------------------------------- 1 | // 2 | // String.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/26/23. 6 | // 7 | 8 | import Foundation 9 | 10 | extension String: Error { 11 | } 12 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp.xcodeproj/project.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | IDEDidComputeMac32BitWarning 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "images" : [ 3 | { 4 | "filename" : "Icon.png", 5 | "idiom" : "universal", 6 | "platform" : "ios", 7 | "size" : "1024x1024" 8 | } 9 | ], 10 | "info" : { 11 | "author" : "xcode", 12 | "version" : 1 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/AlpacaChatApp.swift: -------------------------------------------------------------------------------- 1 | // 2 | // AlpacaChatApp.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 
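// The app entry point is a single WindowGroup scene that shows ChatView.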
6 | // 7 | 8 | import SwiftUI 9 | 10 | @main 11 | struct AlpacaChatApp: App { 12 | var body: some Scene { 13 | WindowGroup { 14 | ChatView() 15 | } 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Resources/Assets.xcassets/AccentColor.colorset/Contents.json: -------------------------------------------------------------------------------- 1 | { 2 | "colors" : [ 3 | { 4 | "color" : { 5 | "platform" : "ios", 6 | "reference" : "systemOrangeColor" 7 | }, 8 | "idiom" : "universal" 9 | } 10 | ], 11 | "info" : { 12 | "author" : "xcode", 13 | "version" : 1 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /Applications/AlpacaChatCLI/Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "pins" : [ 3 | { 4 | "identity" : "swift-argument-parser", 5 | "kind" : "remoteSourceControl", 6 | "location" : "https://github.com/apple/swift-argument-parser", 7 | "state" : { 8 | "revision" : "fee6933f37fde9a5e12a1e4aeaa93fe60116ff2a", 9 | "version" : "1.2.2" 10 | } 11 | } 12 | ], 13 | "version" : 2 14 | } 15 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.apple.developer.kernel.extended-virtual-addressing 6 | 7 | com.apple.developer.kernel.increased-memory-limit 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /Sources/AlpacaChat/Model.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Model.swift 3 | // AlpacaChat 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 6 | // 7 | 8 | import Foundation 9 | import AlpacaChatObjC 10 | 11 | public struct Model { 12 | var model: ALPChatModel 13 | 14 | @available(macOS 10.15, iOS 13.0, watchOS 6.0, tvOS 13.0, *) 15 | public static func load(from url: URL, contextSize: Int32 = 512, isLowMemory: Bool = false) async throws -> Self { 16 | let model = try ALPChatModel.load(from: url, contextSize: contextSize, isLowMemory: isLowMemory) 17 | return Model(model: model) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/Message.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Message.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/20/23. 6 | // 7 | 8 | import Foundation 9 | 10 | struct Message: Identifiable { 11 | enum State { 12 | case none 13 | case error 14 | case typed 15 | case predicting 16 | case predicted(tokensPerSeconds: Double) 17 | } 18 | 19 | enum Sender { 20 | case user 21 | case system 22 | } 23 | 24 | var id = UUID() 25 | var sender: Sender 26 | var state: State = .none 27 | var text: String 28 | } 29 | -------------------------------------------------------------------------------- /Sources/AlpacaChat/Chat.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Chat.swift 3 | // AlpacaChat 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 
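// Chat wraps the Objective-C++ ALPChat and exposes prediction as an AsyncThrowingStream, yielding each predicted token as a String and finishing with any error reported by the completion handler.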
6 | // 7 | 8 | import Foundation 9 | import AlpacaChatObjC 10 | 11 | public final class Chat { 12 | private let chat: ALPChat 13 | 14 | public init(model: Model) { 15 | chat = ALPChat(model: model.model) 16 | } 17 | 18 | @available(macOS 10.15, iOS 13.0, watchOS 6.0, tvOS 13.0, *) 19 | public func predictTokens(for prompt: String) -> AsyncThrowingStream { 20 | AsyncThrowingStream { continuation in 21 | chat.predictTokens(for: prompt) { token in 22 | continuation.yield(token) 23 | } completionHandler: { error in 24 | continuation.finish(throwing: error) 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /Sources/AlpacaChatObjC/include/ALPChatModel.h: -------------------------------------------------------------------------------- 1 | // 2 | // ALPChatModel.h 3 | // AlpacaChatObjC 4 | // 5 | // Created by Yoshimasa Niwa on 3/16/23. 6 | // 7 | 8 | #import 9 | 10 | NS_ASSUME_NONNULL_BEGIN 11 | 12 | FOUNDATION_EXPORT NSString * const ALPChatModelErrorDomain; 13 | 14 | NS_ENUM(NSUInteger, ALPChatModelErrorCode) { 15 | ALPChatModelErrorCodeUnknown = 0, 16 | ALPChatModelErrorCodeFailedToLoad 17 | }; 18 | 19 | @interface ALPChatModel : NSObject 20 | 21 | + (nullable ALPChatModel *)loadFromURL:(NSURL *)URL 22 | contextSize:(int)contextSize 23 | isLowMemory:(BOOL)isLowMemory 24 | error:(NSError * _Nullable * _Nullable)error; 25 | 26 | - (instancetype)init NS_UNAVAILABLE; 27 | + (instancetype)new NS_UNAVAILABLE; 28 | 29 | @end 30 | 31 | NS_ASSUME_NONNULL_END 32 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Configurations/AlpacaChatApp.xcconfig: -------------------------------------------------------------------------------- 1 | // 2 | // AlpacaChatApp.xcconfig 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 6 | // 7 | 8 | // Place `Local.xcconfig` file which is git ignored in this `Configurations` directory 9 | // can supply local specific configurations such as code signing identity. 10 | // For example, put next content as `Local.xcconfig`. 11 | // ``` 12 | // CODE_SIGN_STYLE = Manual 13 | // CODE_SIGN_IDENTITY = iPhone Developer 14 | // DEVELOPMENT_TEAM = $(YOUR_DEVELOPMENT_TEAM_ID) 15 | // PRODUCT_BUNDLE_IDENTIFIER = $(YOUR_APP_BUNDLE_IDENTIFIER) 16 | // PROVISIONING_PROFILE_SPECIFIER = $(YOUR_PROFILE_NAME) 17 | // ``` 18 | 19 | // Default values, you can override it in `Local.xcconfig`. 20 | CODE_SIGN_STYLE = Manual 21 | CODE_SIGN_IDENTITY = iPhone Developer 22 | PRODUCT_BUNDLE_IDENTIFIER = at.niw.AlpacaChatApp 23 | 24 | #include? "Local.xcconfig" 25 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.7 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 
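// The AlpacaChat library product is built from three layered targets: the C++ core (alpaca.cpp), the Objective-C++ bridge (AlpacaChatObjC), and the Swift API (AlpacaChat).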
3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "AlpacaChat", 8 | products: [ 9 | .library( 10 | name: "AlpacaChat", 11 | targets: [ 12 | "AlpacaChat", 13 | "AlpacaChatObjC" 14 | ] 15 | ) 16 | ], 17 | targets: [ 18 | .target( 19 | name: "AlpacaChat", 20 | dependencies: [ 21 | .target(name: "AlpacaChatObjC") 22 | ] 23 | ), 24 | .target( 25 | name: "AlpacaChatObjC", 26 | dependencies: [ 27 | .target(name: "alpaca.cpp") 28 | ] 29 | ), 30 | .target( 31 | name: "alpaca.cpp" 32 | ) 33 | ], 34 | cxxLanguageStandard: .cxx11 35 | ) 36 | -------------------------------------------------------------------------------- /Applications/AlpacaChatCLI/Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 5.7 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "AlpacaChatCLI", 8 | platforms: [ 9 | .macOS(.v10_15) 10 | ], 11 | products: [ 12 | .executable( 13 | name: "AlpacaChatCLI", 14 | targets: [ 15 | "AlpacaChatCLI" 16 | ] 17 | ) 18 | ], 19 | dependencies: [ 20 | .package(name: "AlpacaChat", path: "../.."), 21 | .package(url: "https://github.com/apple/swift-argument-parser", .upToNextMajor(from: "1.2.2")) 22 | ], 23 | targets: [ 24 | .executableTarget( 25 | name: "AlpacaChatCLI", 26 | dependencies: [ 27 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 28 | .product(name: "AlpacaChat", package: "AlpacaChat") 29 | ] 30 | ), 31 | ] 32 | ) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023 Yoshimasa Niwa 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /Sources/AlpacaChatObjC/include/ALPChat.h: -------------------------------------------------------------------------------- 1 | // 2 | // ALPChat.h 3 | // AlpacaChatObjC 4 | // 5 | // Created by Yoshimasa Niwa on 3/16/23. 
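// ALPChat predicts tokens for a prompt asynchronously: tokens are delivered one by one through tokenHandler, success or failure through completionHandler, and the returned object can be used to cancel an in-flight prediction (see ALPChatCancellable).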
6 | // 7 | 8 | #import 9 | 10 | NS_ASSUME_NONNULL_BEGIN 11 | 12 | @class ALPChatModel; 13 | 14 | FOUNDATION_EXPORT NSString * const ALPChatErrorDomain; 15 | 16 | NS_ENUM(NSUInteger, ALPChatErrorCode) { 17 | ALPChatErrorCodeUnknown = 0, 18 | ALPChatErrorCodeCancelled, 19 | ALPChatErrorCodeFailedToPredict, 20 | ALPChatErrorCodeNoRemainingTokens, 21 | }; 22 | 23 | @protocol ALPChatCancellable 24 | 25 | - (void)cancel; 26 | 27 | @end 28 | 29 | @interface ALPChat : NSObject 30 | 31 | - (instancetype)initWithModel:(ALPChatModel *)model NS_DESIGNATED_INITIALIZER; 32 | - (instancetype)init NS_UNAVAILABLE; 33 | + (instancetype)new NS_UNAVAILABLE; 34 | 35 | - (id)predictTokensForPrompt:(NSString *)prompt 36 | tokenHandler:(nullable void (^)(NSString *token))tokenHandler 37 | completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler 38 | NS_SWIFT_NAME(predictTokens(for:tokenHandler:completionHandler:)); 39 | 40 | @end 41 | 42 | NS_ASSUME_NONNULL_END 43 | -------------------------------------------------------------------------------- /Applications/AlpacaChatCLI/Sources/AlpacaChatCLI/Command.swift: -------------------------------------------------------------------------------- 1 | // 2 | // Command.swift 3 | // AlpacaChatCLI 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 6 | // 7 | 8 | import Foundation 9 | import AlpacaChat 10 | import ArgumentParser 11 | import Darwin 12 | 13 | @main 14 | struct Command: AsyncParsableCommand { 15 | @Option(name: .shortAndLong, help: "Path to model file.") 16 | var modelPath: String 17 | @Option(name: .shortAndLong, help: "Context size.") 18 | var contextSize: Int32 = 2048 19 | @Flag(name: .shortAndLong, help: "Use low memory model loading.") 20 | var lowMemory: Bool = false 21 | 22 | mutating func run() async throws { 23 | let modelURL = URL(fileURLWithPath: modelPath) 24 | let model = try await Model.load(from: modelURL, contextSize: contextSize, isLowMemory: lowMemory) 25 | let chat = Chat(model: model) 26 | 27 | while true { 28 | print("> ", terminator: "") 29 | guard let prompt = readLine() else { 30 | break 31 | } 32 | guard !prompt.isEmpty else { 33 | continue 34 | } 35 | 36 | for try await token in chat.predictTokens(for: prompt) { 37 | print(token, terminator: "") 38 | fflush(stdout) 39 | } 40 | print("") 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Georgi Gerganov 4 | https://github.com/ggerganov/llama.cpp 5 | 6 | Copyright (c) 2023 Kevin Kwok 7 | https://github.com/antimatter15/alpaca.cpp 8 | 9 | Copyright (c) 2023 Caize Wu 10 | https://github.com/Zepan/llama.cpp/commit/03ba421c74109b5bff297b207a1b47f8cc6fc05e 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 
21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/ChatView.swift: -------------------------------------------------------------------------------- 1 | // 2 | // ChatView.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/18/23. 6 | // 7 | 8 | import SwiftUI 9 | 10 | struct ChatView: View { 11 | @StateObject 12 | private var viewModel = ChatViewModel() 13 | 14 | @State 15 | private var inputText: String = "" 16 | 17 | var body: some View { 18 | VStack { 19 | List { 20 | ForEach(viewModel.messages) { message in 21 | MessageView(message: message) 22 | } 23 | .listRowSeparator(.hidden) 24 | } 25 | HStack { 26 | switch viewModel.state { 27 | case .none, .loading: 28 | ProgressView { 29 | Text("Loading...") 30 | } 31 | case .completed: 32 | TextField("Type your message...", text: $inputText) 33 | .textFieldStyle(RoundedBorderTextFieldStyle()) 34 | Button { 35 | Task { 36 | let text = inputText 37 | inputText = "" 38 | await viewModel.send(message: text) 39 | } 40 | } label: { 41 | Image(systemName: "arrow.up.circle.fill") 42 | } 43 | .padding(.horizontal, 6.0) 44 | .disabled(inputText.isEmpty) 45 | } 46 | } 47 | .padding(.all) 48 | } 49 | .navigationTitle("Chat") 50 | .task { 51 | await viewModel.prepare() 52 | } 53 | } 54 | } 55 | 56 | struct ChatView_Previews: PreviewProvider { 57 | static var previews: some View { 58 | ChatView() 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AlpacaChat 2 | ========== 3 | 4 | A Swift library that runs Alpaca-LoRA prediction locally 5 | to implement ChatGPT like app on Apple platform devices. 6 | 7 | ![AlpacaChat](Resources/AlpacaChat.png) 8 | 9 | It is basically a wrapper for [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp) 10 | that provides a simple Swift API for it. 11 | 12 | ```swift 13 | import AlpacaChat 14 | 15 | // Load model and instantiate a chat. 16 | let model = try await Model.load(from: URL(fileURLWithPath: "model.bin")) 17 | let chat = Chat(model: model) 18 | 19 | // Ask users to get prompt. 20 | let prompt = readLine()! 21 | 22 | // Run prediction and print tokens. 23 | for try await token in chat.predictTokens(for: prompt) { 24 | print(token) 25 | } 26 | ``` 27 | 28 | 29 | Model 30 | ----- 31 | 32 | Read [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp), 33 | [alpaca-lora](https://github.com/tloen/alpaca-lora), and 34 | [llma.cpp](https://github.com/ggerganov/llama.cpp), 35 | then create 4-bits quantized `ggml` model bin file. 36 | 37 | Place it in `/Applications/AlpacaChatApp/Resouces/model.bin` for example, 38 | and build app and run it. 39 | 40 | 41 | Usage 42 | ----- 43 | 44 | See actual command line and SwiftUI application for usages. 45 | 46 | 47 | Applications 48 | ------------ 49 | 50 | ### `/Applications/AlpacaChatCLI` 51 | 52 | A command line chat app that can run on macOS. 
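Once built with the commands below, the binary accepts the options declared in `Command.swift`: the model path, an optional context size (default 2048), and a low-memory loading flag. The long option spellings here assume swift-argument-parser's default kebab-case naming for `.shortAndLong` options:

```
$ .build/release/AlpacaChatCLI --model-path /path/to/model.bin --context-size 512 --low-memory
```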
53 | 54 | To build, use Xcode or simply use `swift` command. 55 | 56 | ``` 57 | $ cd Applications/AlpacaChatCLI 58 | $ swift build -c release 59 | $ .build/release/AlpacaChatCLI -m /path/to/model.bin 60 | ``` 61 | 62 | ### `/Applications/AlpacaChatApp.xcodeproj` 63 | 64 | A SwiftUI chat app that can run on iOS devices. 65 | 66 | To build app runs on actual device, you need to create your own AppID 67 | and provisioning profile that allows extended memory usage with 68 | an entitlement. 69 | 70 | Place `/Applications/AlpacaChatApp/Configurations/Local.xcconfig` 71 | to provide these your local development configurations for signing. 72 | 73 | You may want to change scheme to use Release configuration for Run, 74 | or it may be seriously slow. 75 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/include/chat.h: -------------------------------------------------------------------------------- 1 | // 2 | // chat.h 3 | // alpaca.cpp 4 | // 5 | // Created by Yoshimasa Niwa on 3/16/23. 6 | // 7 | 8 | #pragma once 9 | 10 | #include 11 | #include 12 | 13 | #include 14 | #include 15 | #include 16 | 17 | // default hparams (LLaMA 7B) 18 | struct llama_hparams { 19 | int32_t n_vocab = 32000; 20 | int32_t n_ctx = 512; // this is provided as user input? 21 | int32_t n_embd = 4096; 22 | int32_t n_mult = 256; 23 | int32_t n_head = 32; 24 | int32_t n_layer = 32; 25 | int32_t n_rot = 64; 26 | int32_t f16 = 1; 27 | }; 28 | 29 | struct llama_layer { 30 | // normalization 31 | struct ggml_tensor * attention_norm; 32 | 33 | // attention 34 | struct ggml_tensor * wq; 35 | struct ggml_tensor * wk; 36 | struct ggml_tensor * wv; 37 | struct ggml_tensor * wo; 38 | 39 | // normalization 40 | struct ggml_tensor * ffn_norm; 41 | 42 | // ff 43 | struct ggml_tensor * w1; 44 | struct ggml_tensor * w2; 45 | struct ggml_tensor * w3; 46 | }; 47 | 48 | struct mbuf_t { 49 | mbuf_t(): buf(nullptr), size(0), p(nullptr), oft(0) {}; 50 | 51 | char* buf; 52 | size_t size; 53 | char* p; 54 | size_t oft; 55 | }; 56 | 57 | struct llama_model { 58 | llama_hparams hparams; 59 | 60 | struct ggml_tensor * tok_embeddings; 61 | 62 | struct ggml_tensor * norm; 63 | struct ggml_tensor * output; 64 | 65 | std::vector layers; 66 | 67 | // key + value memory 68 | struct ggml_tensor * memory_k; 69 | struct ggml_tensor * memory_v; 70 | 71 | // 72 | struct ggml_context * ctx; 73 | std::map tensors; 74 | 75 | mbuf_t mbuf; 76 | }; 77 | 78 | void llma_model_unload(llama_model &model); 79 | 80 | bool llama_model_load(const std::string &fname, 81 | llama_model &model, 82 | gpt_vocab &vocab, 83 | int n_ctx); 84 | 85 | bool llama_model_load_lowmem(const std::string &fname, 86 | llama_model &model, 87 | gpt_vocab &vocab, 88 | int n_ctx); 89 | 90 | bool llama_eval(const llama_model &model, 91 | const int n_threads, 92 | const int n_past, 93 | const std::vector &embd_inp, 94 | std::vector &embd_w, 95 | size_t &mem_per_token); 96 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/MessageView.swift: -------------------------------------------------------------------------------- 1 | // 2 | // MessageView.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/20/23. 
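// Renders a single chat bubble: a sender label plus message content whose appearance follows Message.state (a spinner while predicting, red text on error, and a tokens/s footnote once prediction completes).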
6 | // 7 | 8 | import SwiftUI 9 | 10 | struct MessageView: View { 11 | var message: Message 12 | 13 | private struct SenderView: View { 14 | var sender: Message.Sender 15 | 16 | var body: some View { 17 | switch sender { 18 | case .user: 19 | Text("You") 20 | .font(.caption) 21 | .foregroundColor(.accentColor) 22 | case .system: 23 | Text("Alpaca") 24 | .font(.caption) 25 | .foregroundColor(.accentColor) 26 | } 27 | } 28 | } 29 | 30 | private struct MessageContentView: View { 31 | var message: Message 32 | 33 | var body: some View { 34 | switch message.state { 35 | case .none: 36 | ProgressView() 37 | case .error: 38 | Text(message.text) 39 | .foregroundColor(Color.red) 40 | case .typed: 41 | Text(message.text) 42 | case .predicting: 43 | HStack { 44 | Text(message.text) 45 | ProgressView() 46 | .padding(.leading, 3.0) 47 | } 48 | case .predicted(tokensPerSeconds: let tokenPerSeconds): 49 | VStack(alignment: .leading) { 50 | Text(message.text) 51 | Text(String(format: "%.2f tokens/s", tokenPerSeconds)) 52 | .font(.footnote) 53 | .foregroundColor(Color.gray) 54 | } 55 | } 56 | } 57 | } 58 | 59 | var body: some View { 60 | HStack { 61 | if message.sender == .user { 62 | Spacer() 63 | } 64 | 65 | VStack(alignment: .leading, spacing: 6.0) { 66 | SenderView(sender: message.sender) 67 | MessageContentView(message: message) 68 | .padding(12.0) 69 | .background(Color.secondary.opacity(0.2)) 70 | .cornerRadius(12.0) 71 | } 72 | 73 | if message.sender == .system { 74 | Spacer() 75 | } 76 | } 77 | } 78 | } 79 | 80 | struct MessageView_Previews: PreviewProvider { 81 | static var previews: some View { 82 | VStack { 83 | MessageView(message: Message(sender: .user, state: .none, text: "none")) 84 | MessageView(message: Message(sender: .user, state: .error, text: "error")) 85 | MessageView(message: Message(sender: .user, state: .predicting, text: "predicting")) 86 | MessageView(message: Message(sender: .user, state: .predicted(tokensPerSeconds: 3.1415), text: "predicted")) 87 | } 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp.xcodeproj/xcshareddata/xcschemes/AlpacaChatApp.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 8 | 9 | 15 | 21 | 22 | 23 | 24 | 25 | 31 | 32 | 42 | 44 | 50 | 51 | 52 | 53 | 59 | 61 | 67 | 68 | 69 | 70 | 72 | 73 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp/Sources/ChatViewModel.swift: -------------------------------------------------------------------------------- 1 | // 2 | // ChatViewModel.swift 3 | // AlpacaChatApp 4 | // 5 | // Created by Yoshimasa Niwa on 3/19/23. 6 | // 7 | 8 | import AlpacaChat 9 | import Foundation 10 | import os 11 | 12 | private extension Duration { 13 | var seconds: Double { 14 | Double(components.seconds) + Double(components.attoseconds) / 1.0e18 15 | } 16 | } 17 | 18 | @MainActor 19 | final class ChatViewModel: ObservableObject { 20 | enum State { 21 | case none 22 | case loading 23 | case completed 24 | } 25 | 26 | private var chat: Chat? 27 | 28 | @Published 29 | var state: State = .none 30 | 31 | @Published 32 | var messages: [Message] = [] 33 | 34 | func prepare() async { 35 | guard chat == nil else { 36 | return 37 | } 38 | 39 | do { 40 | state = .loading 41 | guard let modelURL = Bundle.main.url(forResource: "model", withExtension: "bin") else { 42 | throw "Model not found." 
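// Throwing a bare String works here because String is retroactively conformed to Error in String.swift.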
43 | } 44 | 45 | let contextSize: Int32 46 | let isLowMemory: Bool 47 | #if targetEnvironment(simulator) 48 | contextSize = 2048 49 | isLowMemory = false 50 | #else 51 | let memorySize = os_proc_available_memory() 52 | if memorySize > 6 * 1024 * 1024 * 1024 { 53 | contextSize = 2048 54 | isLowMemory = false 55 | } else { 56 | contextSize = 512 57 | isLowMemory = true 58 | } 59 | #endif 60 | let model = try await Model.load(from: modelURL, contextSize: contextSize, isLowMemory: isLowMemory) 61 | chat = Chat(model: model) 62 | } catch { 63 | let message = Message(sender: .system, text: "Failed to load model.") 64 | messages.append(message) 65 | } 66 | state = .completed 67 | } 68 | 69 | func send(message text: String) async { 70 | let requestMessage = Message(sender: .user, state: .typed, text: text) 71 | messages.append(requestMessage) 72 | 73 | guard let chat = chat else { 74 | let message = Message(sender: .system, state: .error, text: "Chat is unavailable.") 75 | messages.append(message) 76 | return 77 | } 78 | 79 | do { 80 | var message = Message(sender: .system, text: "") 81 | messages.append(message) 82 | let messageIndex = messages.endIndex - 1 83 | 84 | var numberOfTokens = 0 85 | let duration = try await ContinuousClock().measure { 86 | for try await token in chat.predictTokens(for: text) { 87 | message.state = .predicting 88 | message.text += token 89 | 90 | var updatedMessages = messages 91 | updatedMessages[messageIndex] = message 92 | messages = updatedMessages 93 | 94 | numberOfTokens += 1 95 | } 96 | } 97 | message.state = .predicted(tokensPerSeconds: Double(numberOfTokens) / duration.seconds) 98 | messages[messageIndex] = message 99 | } catch { 100 | let message = Message(sender: .system, state: .error, text: error.localizedDescription) 101 | messages.append(message) 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/include/utils.h: -------------------------------------------------------------------------------- 1 | // Various helper functions and utilities 2 | 3 | #pragma once 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | // 12 | // CLI argument parsing 13 | // 14 | 15 | struct gpt_params { 16 | int32_t seed = -1; // RNG seed 17 | int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); 18 | int32_t n_predict = 128; // new tokens to predict 19 | int32_t repeat_last_n = 64; // last n tokens to penalize 20 | int32_t n_ctx = 512; //context size 21 | 22 | // sampling parameters 23 | int32_t top_k = 40; 24 | float top_p = 0.95f; 25 | float temp = 0.80f; 26 | float repeat_penalty = 1.30f; 27 | 28 | int32_t n_batch = 8; // batch size for prompt processing 29 | 30 | std::string model = "models/lamma-7B/ggml-model.bin"; // model path 31 | std::string prompt; 32 | 33 | bool use_color = false; // use color to distinguish generations and inputs 34 | 35 | bool interactive = false; // interactive mode 36 | bool interactive_start = false; // reverse prompt immediately 37 | std::string antiprompt = ""; // string upon seeing which more user input is prompted 38 | }; 39 | 40 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params); 41 | 42 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params); 43 | 44 | std::string gpt_random_prompt(std::mt19937 & rng); 45 | 46 | // 47 | // Vocab utils 48 | // 49 | 50 | struct gpt_vocab { 51 | using id = int32_t; 52 | using token = std::string; 53 | 54 | std::map token_to_id; 55 | std::map 
id_to_token; 56 | }; 57 | 58 | void replace(std::string & str, const std::string & needle, const std::string & replacement); 59 | 60 | // poor-man's JSON parsing 61 | std::map json_parse(const std::string & fname); 62 | 63 | // split text into tokens 64 | // 65 | // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53 66 | // 67 | // Regex (Python): 68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""" 69 | // 70 | // Regex (C++): 71 | // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)" 72 | // 73 | std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text); 74 | 75 | // TODO: this is probably wrong, but I cannot figure out how this tokenizer works .. 76 | // ref: https://github.com/google/sentencepiece 77 | std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos); 78 | 79 | // load the tokens from encoder.json 80 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab); 81 | 82 | // sample next token given probabilities for each embedding 83 | // 84 | // - consider only the top K tokens 85 | // - from them, consider only the top tokens with cumulative probability > P 86 | // 87 | gpt_vocab::id llama_sample_top_p_top_k( 88 | const gpt_vocab & vocab, 89 | const float * logits, 90 | std::vector & last_n_tokens, 91 | double repeat_penalty, 92 | int top_k, 93 | double top_p, 94 | double temp, 95 | std::mt19937 & rng); 96 | 97 | // filer to top K tokens from list of logits 98 | void sample_top_k(std::vector> & logits_id, int top_k); 99 | 100 | // 101 | // Quantization 102 | // 103 | 104 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist); 105 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist); 106 | -------------------------------------------------------------------------------- /Sources/AlpacaChatObjC/ALPChat.mm: -------------------------------------------------------------------------------- 1 | // 2 | // ALPChat.mm 3 | // AlpacaChatObjC 4 | // 5 | // Created by Yoshimasa Niwa on 3/16/23. 
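// Objective-C++ bridge to alpaca.cpp: ALPChatModel owns the llama_model and vocabulary, and ALPChat drives the evaluate/sample loop (llama_eval, llama_sample_top_p_top_k) on a serial worker queue.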
6 | // 7 | 8 | #import "ALPChat.h" 9 | #import "ALPChatModel.h" 10 | 11 | #include 12 | 13 | #include 14 | #include 15 | 16 | NSString * const ALPChatModelErrorDomain = @"ALPChatModelErrorDomain"; 17 | 18 | @implementation ALPChatModel 19 | { 20 | @public 21 | llama_model _model; 22 | gpt_vocab _vocab; 23 | } 24 | 25 | + (ALPChatModel *)loadFromURL:(NSURL *)URL 26 | contextSize:(int)contextSize 27 | isLowMemory:(BOOL)isLowMemory 28 | error:(NSError **)error 29 | { 30 | gpt_vocab vocab; 31 | llama_model model; 32 | 33 | bool success = false; 34 | if (isLowMemory) { 35 | success = llama_model_load_lowmem(URL.fileSystemRepresentation, model, vocab, contextSize); 36 | } else { 37 | success = llama_model_load(URL.fileSystemRepresentation, model, vocab, contextSize); 38 | } 39 | if (!success) { 40 | if (error) { 41 | NSString * const failureReason = [[NSString alloc] initWithFormat:@"failed to load model: %@", URL]; 42 | NSDictionary * const userInfo = @{ 43 | NSLocalizedFailureReasonErrorKey: failureReason 44 | }; 45 | *error = [[NSError alloc] initWithDomain:ALPChatModelErrorDomain 46 | code:ALPChatModelErrorCodeFailedToLoad 47 | userInfo:userInfo]; 48 | } 49 | return nil; 50 | } 51 | 52 | return [[ALPChatModel alloc] initWithModel:model vocab:vocab]; 53 | } 54 | 55 | - (instancetype)initWithModel:(const llama_model &)model vocab:(const gpt_vocab &)vocab 56 | { 57 | if (self = [super init]) { 58 | _model = model; 59 | _vocab = vocab; 60 | } 61 | return self; 62 | } 63 | 64 | - (instancetype)init 65 | { 66 | [self doesNotRecognizeSelector:_cmd]; 67 | abort(); 68 | } 69 | 70 | - (void)dealloc 71 | { 72 | llma_model_unload(_model); 73 | } 74 | 75 | @end 76 | 77 | // MARK: - 78 | 79 | @interface ALPChatPredicationCancellable : NSObject 80 | 81 | @end 82 | 83 | @implementation ALPChatPredicationCancellable 84 | { 85 | @public 86 | std::atomic _cancelled; 87 | } 88 | 89 | - (instancetype)init 90 | { 91 | if (self = [super init]) { 92 | _cancelled.store(false); 93 | } 94 | return self; 95 | } 96 | 97 | - (void)cancel 98 | { 99 | _cancelled.store(true); 100 | } 101 | 102 | @end 103 | 104 | // MARK: - 105 | 106 | NSString * const ALPChatErrorDomain = @"ALPChatErrorDomain"; 107 | 108 | @implementation ALPChat 109 | { 110 | ALPChatModel *_model; 111 | dispatch_queue_t _workerQueue; 112 | 113 | gpt_params _params; 114 | 115 | std::mt19937 _rng; 116 | 117 | int _n_past; 118 | int _n_remaining_tokens; 119 | 120 | std::vector _request_tokens; 121 | std::vector _response_tokens; 122 | 123 | std::vector _embd; 124 | std::vector _last_n_tokens; 125 | 126 | std::vector _logits; 127 | size_t _mem_per_token; 128 | 129 | bool _prepared; 130 | } 131 | 132 | - (instancetype)initWithModel:(ALPChatModel *)model 133 | { 134 | if (self = [super init]) { 135 | _model = model; 136 | _workerQueue = dispatch_queue_create("ALPChat.workerQueue", DISPATCH_QUEUE_SERIAL_WITH_AUTORELEASE_POOL); 137 | dispatch_async(_workerQueue, ^{ 138 | [self _alp_worker_initialize]; 139 | }); 140 | } 141 | return self; 142 | } 143 | 144 | - (instancetype)init 145 | { 146 | [self doesNotRecognizeSelector:_cmd]; 147 | abort(); 148 | } 149 | 150 | - (void)_alp_worker_initialize 151 | { 152 | // Use mostly default values. 
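// Only the temperature and the thread count are overridden below; the other sampling parameters (top_k, top_p, repeat_penalty, n_batch) keep the gpt_params defaults declared in utils.h.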
153 | _params.temp = 0.1f; 154 | _params.n_threads = (int32_t)std::thread::hardware_concurrency(); 155 | #if DEBUG 156 | fprintf(stderr, "%s: hardware concurrency = %d\n", __func__, (int32_t) std::thread::hardware_concurrency()); 157 | fprintf(stderr, "%s: n_threads = %d\n", __func__, _params.n_threads); 158 | #endif // DEBUG 159 | 160 | const int32_t seed = (int32_t)time(NULL); 161 | _rng = std::mt19937(seed); 162 | 163 | _n_past = 0; 164 | _n_remaining_tokens = 0; 165 | 166 | _request_tokens = ::llama_tokenize(_model->_vocab, "## Instruction:\n\n", true); 167 | _response_tokens = ::llama_tokenize(_model->_vocab, "\n## Response:\n\n", false); 168 | 169 | _last_n_tokens = std::vector(_params.repeat_last_n); 170 | std::fill(_last_n_tokens.begin(), _last_n_tokens.end(), 0); 171 | } 172 | 173 | - (id)predictTokensForPrompt:(NSString *)prompt 174 | tokenHandler:(nullable void (^)(NSString *token))tokenHandler 175 | completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler 176 | { 177 | ALPChatPredicationCancellable * const cancellable = [[ALPChatPredicationCancellable alloc] init]; 178 | dispatch_async(_workerQueue, ^{ 179 | [self _alp_worker_predictTokensForPrompt:prompt 180 | tokenHandler:tokenHandler 181 | completionHandler:completionHandler 182 | cancellable:cancellable]; 183 | }); 184 | return cancellable; 185 | } 186 | 187 | - (void)_alp_worker_predictTokensForPrompt:(NSString *)prompt 188 | tokenHandler:(nullable void (^)(NSString *token))tokenHandler 189 | completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler 190 | cancellable:(ALPChatPredicationCancellable *)cancellable 191 | { 192 | std::vector input_tokens; 193 | 194 | if (!_prepared) { 195 | // Determine the required inference memory per token. 196 | // This takes some duration. 197 | llama_eval(_model->_model, _params.n_threads, 0, { 0, 1, 2, 3 }, _logits, _mem_per_token); 198 | 199 | // We may want to slide the input window along with the context, 200 | // but for now we restrict to the context length. 
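// The remaining-token budget therefore starts at n_ctx and shrinks with every prompt and sampled token; when it reaches zero, prediction ends with ALPChatErrorCodeNoRemainingTokens.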
201 | _n_remaining_tokens = _model->_model.hparams.n_ctx; 202 | 203 | _prepared = true; 204 | } 205 | 206 | input_tokens.insert(input_tokens.end(), _request_tokens.begin(), _request_tokens.end()); 207 | 208 | const char * const promptCString = [prompt cStringUsingEncoding:NSUTF8StringEncoding]; 209 | std::vector prompt_tokens = ::llama_tokenize(_model->_vocab, promptCString, false); 210 | input_tokens.insert(input_tokens.end(), prompt_tokens.begin(), prompt_tokens.end()); 211 | input_tokens.insert(input_tokens.end(), _response_tokens.begin(), _response_tokens.end()); 212 | 213 | _n_remaining_tokens -= _request_tokens.size() + prompt_tokens.size() + _response_tokens.size(); 214 | 215 | int n_consumed_input_tokens = 0; 216 | bool is_input_tokens_consumed = false; 217 | 218 | while (_n_remaining_tokens > 0) { 219 | if (cancellable->_cancelled.load()) { 220 | if (completionHandler) { 221 | NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain 222 | code:ALPChatErrorCodeCancelled 223 | userInfo:nil]; 224 | completionHandler(error); 225 | } 226 | return; 227 | } 228 | #if DEBUG 229 | fprintf(stderr, "\nremaining_tokens = %d\n", _n_remaining_tokens); 230 | #endif // DEBUG 231 | 232 | // Predict 233 | if (_embd.size() > 0) { 234 | #if DEBUG 235 | const int64_t t_start_sample_us = ggml_time_us(); 236 | fprintf(stderr, "start predicting...\n"); 237 | #endif // DEBUG 238 | if (!llama_eval(_model->_model, _params.n_threads, _n_past, _embd, _logits, _mem_per_token)) { 239 | if (completionHandler) { 240 | NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain 241 | code:ALPChatErrorCodeFailedToPredict 242 | userInfo:nil]; 243 | completionHandler(error); 244 | } 245 | return; 246 | } 247 | #if DEBUG 248 | fprintf(stderr, "done %8.2f ms\n", (ggml_time_us() - t_start_sample_us) / 1000.0f); 249 | #endif // DEBUG 250 | } 251 | 252 | _n_past += _embd.size(); 253 | _embd.clear(); 254 | 255 | if (n_consumed_input_tokens >= input_tokens.size()) { 256 | is_input_tokens_consumed = true; 257 | } 258 | 259 | if (is_input_tokens_consumed) { 260 | const float top_k = _params.top_k; 261 | const float top_p = _params.top_p; 262 | const float temp = _params.temp; 263 | const float repeat_penalty = _params.repeat_penalty; 264 | 265 | const int n_vocab = _model->_model.hparams.n_vocab; 266 | 267 | gpt_vocab::id ident = llama_sample_top_p_top_k(_model->_vocab, _logits.data() + (_logits.size() - n_vocab), _last_n_tokens, repeat_penalty, top_k, top_p, temp, _rng); 268 | 269 | _last_n_tokens.erase(_last_n_tokens.begin()); 270 | _last_n_tokens.push_back(ident); 271 | 272 | // add it to the context 273 | _embd.push_back(ident); 274 | 275 | // decrement remaining sampling budget 276 | --_n_remaining_tokens; 277 | } else { 278 | while (n_consumed_input_tokens < input_tokens.size()) { 279 | #if DEBUG 280 | fprintf(stderr, "%6d -> '%s'\n", input_tokens[n_consumed_input_tokens], _model->_vocab.id_to_token.at(input_tokens[n_consumed_input_tokens]).c_str()); 281 | #endif // DEBUG 282 | 283 | _embd.push_back(input_tokens[n_consumed_input_tokens]); 284 | 285 | _last_n_tokens.erase(_last_n_tokens.begin()); 286 | _last_n_tokens.push_back(input_tokens[n_consumed_input_tokens]); 287 | ++n_consumed_input_tokens; 288 | 289 | if (_embd.size() > _params.n_batch) { 290 | break; 291 | } 292 | } 293 | } 294 | 295 | #if DEBUG 296 | { 297 | #else 298 | if (is_input_tokens_consumed) { 299 | #endif // DEBUG 300 | for (auto ident : _embd) { 301 | const char *tokenCString = 
_model->_vocab.id_to_token[ident].c_str(); 302 | #if DEBUG 303 | printf("%s", tokenCString); 304 | 305 | if (is_input_tokens_consumed) { 306 | #endif // DEBUG 307 | if (tokenHandler) { 308 | NSString * const tokenString = [[NSString alloc] initWithUTF8String:tokenCString]; 309 | tokenHandler(tokenString); 310 | } 311 | #if DEBUG 312 | } 313 | #endif // DEBUG 314 | } 315 | #if DEBUG 316 | fflush(stdout); 317 | #endif // DEBUG 318 | } 319 | 320 | if (_embd.size() > 0 && _embd.back() == 2) { 321 | #if DEBUG 322 | fprintf(stderr, " [end of text]\n"); 323 | #endif // DEBUG 324 | if (completionHandler) { 325 | completionHandler(nil); 326 | } 327 | return; 328 | } 329 | } 330 | 331 | if (completionHandler) { 332 | NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain 333 | code:ALPChatErrorCodeNoRemainingTokens 334 | userInfo:nil]; 335 | completionHandler(error); 336 | } 337 | } 338 | 339 | @end 340 | -------------------------------------------------------------------------------- /Applications/AlpacaChatApp.xcodeproj/project.pbxproj: -------------------------------------------------------------------------------- 1 | // !$*UTF8*$! 2 | { 3 | archiveVersion = 1; 4 | classes = { 5 | }; 6 | objectVersion = 56; 7 | objects = { 8 | 9 | /* Begin PBXBuildFile section */ 10 | 5423912729C5D1840041C234 /* AlpacaChatApp.xcconfig in Resources */ = {isa = PBXBuildFile; fileRef = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */; }; 11 | 54601CFB29C701F900E459DD /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54601CFA29C701F900E459DD /* ChatViewModel.swift */; }; 12 | 5498C52229D10E0A0090856F /* String.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5498C52129D10E0A0090856F /* String.swift */; }; 13 | 54B223E629C5CF9F006F4683 /* AlpacaChatApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */; }; 14 | 54B223E829C5CF9F006F4683 /* ChatView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54B223E729C5CF9F006F4683 /* ChatView.swift */; }; 15 | 54B223EA29C5CF9F006F4683 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 54B223E929C5CF9F006F4683 /* Assets.xcassets */; }; 16 | 54B223ED29C5CF9F006F4683 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */; }; 17 | 54B223FC29C5D075006F4683 /* AlpacaChat in Frameworks */ = {isa = PBXBuildFile; productRef = 54B223FB29C5D075006F4683 /* AlpacaChat */; }; 18 | 54B26D2F29C7D81A00A9AF05 /* model.bin in Resources */ = {isa = PBXBuildFile; fileRef = 54B26D2E29C7D81A00A9AF05 /* model.bin */; }; 19 | 54E9B23829C97AEC00958DFE /* Message.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54E9B23629C97AEC00958DFE /* Message.swift */; }; 20 | 54E9B23929C97AEC00958DFE /* MessageView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54E9B23729C97AEC00958DFE /* MessageView.swift */; }; 21 | /* End PBXBuildFile section */ 22 | 23 | /* Begin PBXFileReference section */ 24 | 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = AlpacaChatApp.xcconfig; sourceTree = ""; }; 25 | 54601CFA29C701F900E459DD /* ChatViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatViewModel.swift; sourceTree = ""; }; 26 | 5498C52129D10E0A0090856F /* String.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = String.swift; sourceTree = ""; }; 27 | 
54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = AlpacaChatApp.app; sourceTree = BUILT_PRODUCTS_DIR; }; 28 | 54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AlpacaChatApp.swift; sourceTree = ""; }; 29 | 54B223E729C5CF9F006F4683 /* ChatView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatView.swift; sourceTree = ""; }; 30 | 54B223E929C5CF9F006F4683 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = ""; }; 31 | 54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = ""; }; 32 | 54B223F629C5CFFA006F4683 /* AlpacaChat */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = AlpacaChat; path = ..; sourceTree = ""; }; 33 | 54B26D2D29C7CA7E00A9AF05 /* AlpacaChatApp.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = AlpacaChatApp.entitlements; sourceTree = ""; }; 34 | 54B26D2E29C7D81A00A9AF05 /* model.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; path = model.bin; sourceTree = ""; }; 35 | 54E9B23629C97AEC00958DFE /* Message.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Message.swift; sourceTree = ""; }; 36 | 54E9B23729C97AEC00958DFE /* MessageView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MessageView.swift; sourceTree = ""; }; 37 | /* End PBXFileReference section */ 38 | 39 | /* Begin PBXFrameworksBuildPhase section */ 40 | 54B223DF29C5CF9F006F4683 /* Frameworks */ = { 41 | isa = PBXFrameworksBuildPhase; 42 | buildActionMask = 2147483647; 43 | files = ( 44 | 54B223FC29C5D075006F4683 /* AlpacaChat in Frameworks */, 45 | ); 46 | runOnlyForDeploymentPostprocessing = 0; 47 | }; 48 | /* End PBXFrameworksBuildPhase section */ 49 | 50 | /* Begin PBXGroup section */ 51 | 5423912529C5D1530041C234 /* Configurations */ = { 52 | isa = PBXGroup; 53 | children = ( 54 | 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */, 55 | ); 56 | path = Configurations; 57 | sourceTree = ""; 58 | }; 59 | 54B223D929C5CF9F006F4683 = { 60 | isa = PBXGroup; 61 | children = ( 62 | 54B223E429C5CF9F006F4683 /* AlpacaChatApp */, 63 | 54B223F529C5CFFA006F4683 /* Packages */, 64 | 54B223E329C5CF9F006F4683 /* Products */, 65 | 54B223FA29C5D075006F4683 /* Frameworks */, 66 | ); 67 | sourceTree = ""; 68 | }; 69 | 54B223E329C5CF9F006F4683 /* Products */ = { 70 | isa = PBXGroup; 71 | children = ( 72 | 54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */, 73 | ); 74 | name = Products; 75 | sourceTree = ""; 76 | }; 77 | 54B223E429C5CF9F006F4683 /* AlpacaChatApp */ = { 78 | isa = PBXGroup; 79 | children = ( 80 | 5423912529C5D1530041C234 /* Configurations */, 81 | 54B223EB29C5CF9F006F4683 /* Preview Content */, 82 | 54B223F429C5CFC6006F4683 /* Resources */, 83 | 54B223F329C5CFBF006F4683 /* Sources */, 84 | 54B26D2C29C7CA6400A9AF05 /* Supporting Files */, 85 | ); 86 | path = AlpacaChatApp; 87 | sourceTree = ""; 88 | }; 89 | 54B223EB29C5CF9F006F4683 /* Preview Content */ = { 90 | isa = PBXGroup; 91 | children = ( 92 | 54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */, 93 | ); 94 | path = "Preview Content"; 95 | sourceTree = ""; 96 | }; 97 | 54B223F329C5CFBF006F4683 /* 
Sources */ = { 98 | isa = PBXGroup; 99 | children = ( 100 | 54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */, 101 | 54B223E729C5CF9F006F4683 /* ChatView.swift */, 102 | 54601CFA29C701F900E459DD /* ChatViewModel.swift */, 103 | 54E9B23629C97AEC00958DFE /* Message.swift */, 104 | 54E9B23729C97AEC00958DFE /* MessageView.swift */, 105 | 5498C52129D10E0A0090856F /* String.swift */, 106 | ); 107 | path = Sources; 108 | sourceTree = ""; 109 | }; 110 | 54B223F429C5CFC6006F4683 /* Resources */ = { 111 | isa = PBXGroup; 112 | children = ( 113 | 54B26D2E29C7D81A00A9AF05 /* model.bin */, 114 | 54B223E929C5CF9F006F4683 /* Assets.xcassets */, 115 | ); 116 | path = Resources; 117 | sourceTree = ""; 118 | }; 119 | 54B223F529C5CFFA006F4683 /* Packages */ = { 120 | isa = PBXGroup; 121 | children = ( 122 | 54B223F629C5CFFA006F4683 /* AlpacaChat */, 123 | ); 124 | name = Packages; 125 | sourceTree = ""; 126 | }; 127 | 54B223FA29C5D075006F4683 /* Frameworks */ = { 128 | isa = PBXGroup; 129 | children = ( 130 | ); 131 | name = Frameworks; 132 | sourceTree = ""; 133 | }; 134 | 54B26D2C29C7CA6400A9AF05 /* Supporting Files */ = { 135 | isa = PBXGroup; 136 | children = ( 137 | 54B26D2D29C7CA7E00A9AF05 /* AlpacaChatApp.entitlements */, 138 | ); 139 | path = "Supporting Files"; 140 | sourceTree = ""; 141 | }; 142 | /* End PBXGroup section */ 143 | 144 | /* Begin PBXNativeTarget section */ 145 | 54B223E129C5CF9F006F4683 /* AlpacaChatApp */ = { 146 | isa = PBXNativeTarget; 147 | buildConfigurationList = 54B223F029C5CF9F006F4683 /* Build configuration list for PBXNativeTarget "AlpacaChatApp" */; 148 | buildPhases = ( 149 | 54B223DE29C5CF9F006F4683 /* Sources */, 150 | 54B223DF29C5CF9F006F4683 /* Frameworks */, 151 | 54B223E029C5CF9F006F4683 /* Resources */, 152 | ); 153 | buildRules = ( 154 | ); 155 | dependencies = ( 156 | ); 157 | name = AlpacaChatApp; 158 | packageProductDependencies = ( 159 | 54B223FB29C5D075006F4683 /* AlpacaChat */, 160 | ); 161 | productName = AlpacaChatApp; 162 | productReference = 54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */; 163 | productType = "com.apple.product-type.application"; 164 | }; 165 | /* End PBXNativeTarget section */ 166 | 167 | /* Begin PBXProject section */ 168 | 54B223DA29C5CF9F006F4683 /* Project object */ = { 169 | isa = PBXProject; 170 | attributes = { 171 | BuildIndependentTargetsInParallel = 1; 172 | LastSwiftUpdateCheck = 1430; 173 | LastUpgradeCheck = 1430; 174 | TargetAttributes = { 175 | 54B223E129C5CF9F006F4683 = { 176 | CreatedOnToolsVersion = 14.3; 177 | }; 178 | }; 179 | }; 180 | buildConfigurationList = 54B223DD29C5CF9F006F4683 /* Build configuration list for PBXProject "AlpacaChatApp" */; 181 | compatibilityVersion = "Xcode 14.0"; 182 | developmentRegion = en; 183 | hasScannedForEncodings = 0; 184 | knownRegions = ( 185 | en, 186 | Base, 187 | ); 188 | mainGroup = 54B223D929C5CF9F006F4683; 189 | productRefGroup = 54B223E329C5CF9F006F4683 /* Products */; 190 | projectDirPath = ""; 191 | projectRoot = ""; 192 | targets = ( 193 | 54B223E129C5CF9F006F4683 /* AlpacaChatApp */, 194 | ); 195 | }; 196 | /* End PBXProject section */ 197 | 198 | /* Begin PBXResourcesBuildPhase section */ 199 | 54B223E029C5CF9F006F4683 /* Resources */ = { 200 | isa = PBXResourcesBuildPhase; 201 | buildActionMask = 2147483647; 202 | files = ( 203 | 54B223ED29C5CF9F006F4683 /* Preview Assets.xcassets in Resources */, 204 | 54B26D2F29C7D81A00A9AF05 /* model.bin in Resources */, 205 | 54B223EA29C5CF9F006F4683 /* Assets.xcassets in Resources */, 206 | 5423912729C5D1840041C234 /* 
AlpacaChatApp.xcconfig in Resources */, 207 | ); 208 | runOnlyForDeploymentPostprocessing = 0; 209 | }; 210 | /* End PBXResourcesBuildPhase section */ 211 | 212 | /* Begin PBXSourcesBuildPhase section */ 213 | 54B223DE29C5CF9F006F4683 /* Sources */ = { 214 | isa = PBXSourcesBuildPhase; 215 | buildActionMask = 2147483647; 216 | files = ( 217 | 5498C52229D10E0A0090856F /* String.swift in Sources */, 218 | 54E9B23929C97AEC00958DFE /* MessageView.swift in Sources */, 219 | 54B223E829C5CF9F006F4683 /* ChatView.swift in Sources */, 220 | 54E9B23829C97AEC00958DFE /* Message.swift in Sources */, 221 | 54601CFB29C701F900E459DD /* ChatViewModel.swift in Sources */, 222 | 54B223E629C5CF9F006F4683 /* AlpacaChatApp.swift in Sources */, 223 | ); 224 | runOnlyForDeploymentPostprocessing = 0; 225 | }; 226 | /* End PBXSourcesBuildPhase section */ 227 | 228 | /* Begin XCBuildConfiguration section */ 229 | 54B223EE29C5CF9F006F4683 /* Debug */ = { 230 | isa = XCBuildConfiguration; 231 | buildSettings = { 232 | ALWAYS_SEARCH_USER_PATHS = NO; 233 | CLANG_ANALYZER_NONNULL = YES; 234 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 235 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 236 | CLANG_ENABLE_MODULES = YES; 237 | CLANG_ENABLE_OBJC_ARC = YES; 238 | CLANG_ENABLE_OBJC_WEAK = YES; 239 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 240 | CLANG_WARN_BOOL_CONVERSION = YES; 241 | CLANG_WARN_COMMA = YES; 242 | CLANG_WARN_CONSTANT_CONVERSION = YES; 243 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 244 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 245 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 246 | CLANG_WARN_EMPTY_BODY = YES; 247 | CLANG_WARN_ENUM_CONVERSION = YES; 248 | CLANG_WARN_INFINITE_RECURSION = YES; 249 | CLANG_WARN_INT_CONVERSION = YES; 250 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 251 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 252 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 253 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 254 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 255 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 256 | CLANG_WARN_STRICT_PROTOTYPES = YES; 257 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 258 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 259 | CLANG_WARN_UNREACHABLE_CODE = YES; 260 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 261 | COPY_PHASE_STRIP = NO; 262 | DEBUG_INFORMATION_FORMAT = dwarf; 263 | ENABLE_STRICT_OBJC_MSGSEND = YES; 264 | ENABLE_TESTABILITY = YES; 265 | GCC_C_LANGUAGE_STANDARD = gnu11; 266 | GCC_DYNAMIC_NO_PIC = NO; 267 | GCC_NO_COMMON_BLOCKS = YES; 268 | GCC_OPTIMIZATION_LEVEL = 0; 269 | GCC_PREPROCESSOR_DEFINITIONS = ( 270 | "DEBUG=1", 271 | "$(inherited)", 272 | ); 273 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 274 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 275 | GCC_WARN_UNDECLARED_SELECTOR = YES; 276 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 277 | GCC_WARN_UNUSED_FUNCTION = YES; 278 | GCC_WARN_UNUSED_VARIABLE = YES; 279 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 280 | MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; 281 | MTL_FAST_MATH = YES; 282 | ONLY_ACTIVE_ARCH = YES; 283 | SDKROOT = iphoneos; 284 | SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG; 285 | SWIFT_OPTIMIZATION_LEVEL = "-Onone"; 286 | }; 287 | name = Debug; 288 | }; 289 | 54B223EF29C5CF9F006F4683 /* Release */ = { 290 | isa = XCBuildConfiguration; 291 | buildSettings = { 292 | ALWAYS_SEARCH_USER_PATHS = NO; 293 | CLANG_ANALYZER_NONNULL = YES; 294 | CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE; 295 | CLANG_CXX_LANGUAGE_STANDARD = "gnu++20"; 296 | CLANG_ENABLE_MODULES = YES; 297 
| CLANG_ENABLE_OBJC_ARC = YES; 298 | CLANG_ENABLE_OBJC_WEAK = YES; 299 | CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES; 300 | CLANG_WARN_BOOL_CONVERSION = YES; 301 | CLANG_WARN_COMMA = YES; 302 | CLANG_WARN_CONSTANT_CONVERSION = YES; 303 | CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES; 304 | CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR; 305 | CLANG_WARN_DOCUMENTATION_COMMENTS = YES; 306 | CLANG_WARN_EMPTY_BODY = YES; 307 | CLANG_WARN_ENUM_CONVERSION = YES; 308 | CLANG_WARN_INFINITE_RECURSION = YES; 309 | CLANG_WARN_INT_CONVERSION = YES; 310 | CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES; 311 | CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES; 312 | CLANG_WARN_OBJC_LITERAL_CONVERSION = YES; 313 | CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR; 314 | CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES; 315 | CLANG_WARN_RANGE_LOOP_ANALYSIS = YES; 316 | CLANG_WARN_STRICT_PROTOTYPES = YES; 317 | CLANG_WARN_SUSPICIOUS_MOVE = YES; 318 | CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE; 319 | CLANG_WARN_UNREACHABLE_CODE = YES; 320 | CLANG_WARN__DUPLICATE_METHOD_MATCH = YES; 321 | COPY_PHASE_STRIP = NO; 322 | DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym"; 323 | ENABLE_NS_ASSERTIONS = NO; 324 | ENABLE_STRICT_OBJC_MSGSEND = YES; 325 | GCC_C_LANGUAGE_STANDARD = gnu11; 326 | GCC_NO_COMMON_BLOCKS = YES; 327 | GCC_WARN_64_TO_32_BIT_CONVERSION = YES; 328 | GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; 329 | GCC_WARN_UNDECLARED_SELECTOR = YES; 330 | GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; 331 | GCC_WARN_UNUSED_FUNCTION = YES; 332 | GCC_WARN_UNUSED_VARIABLE = YES; 333 | IPHONEOS_DEPLOYMENT_TARGET = 16.0; 334 | MTL_ENABLE_DEBUG_INFO = NO; 335 | MTL_FAST_MATH = YES; 336 | SDKROOT = iphoneos; 337 | SWIFT_COMPILATION_MODE = wholemodule; 338 | SWIFT_OPTIMIZATION_LEVEL = "-O"; 339 | VALIDATE_PRODUCT = YES; 340 | }; 341 | name = Release; 342 | }; 343 | 54B223F129C5CF9F006F4683 /* Debug */ = { 344 | isa = XCBuildConfiguration; 345 | baseConfigurationReference = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */; 346 | buildSettings = { 347 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 348 | ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 349 | CODE_SIGN_ENTITLEMENTS = "AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements"; 350 | CURRENT_PROJECT_VERSION = 1; 351 | DEVELOPMENT_ASSET_PATHS = "\"AlpacaChatApp/Preview Content\""; 352 | ENABLE_PREVIEWS = YES; 353 | GENERATE_INFOPLIST_FILE = YES; 354 | INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; 355 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 356 | INFOPLIST_KEY_UILaunchScreen_Generation = YES; 357 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 358 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 359 | LD_RUNPATH_SEARCH_PATHS = ( 360 | "$(inherited)", 361 | "@executable_path/Frameworks", 362 | ); 363 | MARKETING_VERSION = 1.0; 364 | PRODUCT_NAME = "$(TARGET_NAME)"; 365 | SWIFT_EMIT_LOC_STRINGS = YES; 366 | SWIFT_VERSION = 5.0; 367 | TARGETED_DEVICE_FAMILY = "1,2"; 368 | }; 369 | name = Debug; 370 | }; 371 | 54B223F229C5CF9F006F4683 /* Release */ = { 372 | isa = XCBuildConfiguration; 373 | baseConfigurationReference = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */; 374 | buildSettings = { 375 | ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; 376 | 
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; 377 | CODE_SIGN_ENTITLEMENTS = "AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements"; 378 | CURRENT_PROJECT_VERSION = 1; 379 | DEVELOPMENT_ASSET_PATHS = "\"AlpacaChatApp/Preview Content\""; 380 | ENABLE_PREVIEWS = YES; 381 | GENERATE_INFOPLIST_FILE = YES; 382 | INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; 383 | INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES; 384 | INFOPLIST_KEY_UILaunchScreen_Generation = YES; 385 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 386 | INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight"; 387 | LD_RUNPATH_SEARCH_PATHS = ( 388 | "$(inherited)", 389 | "@executable_path/Frameworks", 390 | ); 391 | MARKETING_VERSION = 1.0; 392 | PRODUCT_NAME = "$(TARGET_NAME)"; 393 | SWIFT_EMIT_LOC_STRINGS = YES; 394 | SWIFT_VERSION = 5.0; 395 | TARGETED_DEVICE_FAMILY = "1,2"; 396 | }; 397 | name = Release; 398 | }; 399 | /* End XCBuildConfiguration section */ 400 | 401 | /* Begin XCConfigurationList section */ 402 | 54B223DD29C5CF9F006F4683 /* Build configuration list for PBXProject "AlpacaChatApp" */ = { 403 | isa = XCConfigurationList; 404 | buildConfigurations = ( 405 | 54B223EE29C5CF9F006F4683 /* Debug */, 406 | 54B223EF29C5CF9F006F4683 /* Release */, 407 | ); 408 | defaultConfigurationIsVisible = 0; 409 | defaultConfigurationName = Release; 410 | }; 411 | 54B223F029C5CF9F006F4683 /* Build configuration list for PBXNativeTarget "AlpacaChatApp" */ = { 412 | isa = XCConfigurationList; 413 | buildConfigurations = ( 414 | 54B223F129C5CF9F006F4683 /* Debug */, 415 | 54B223F229C5CF9F006F4683 /* Release */, 416 | ); 417 | defaultConfigurationIsVisible = 0; 418 | defaultConfigurationName = Release; 419 | }; 420 | /* End XCConfigurationList section */ 421 | 422 | /* Begin XCSwiftPackageProductDependency section */ 423 | 54B223FB29C5D075006F4683 /* AlpacaChat */ = { 424 | isa = XCSwiftPackageProductDependency; 425 | productName = AlpacaChat; 426 | }; 427 | /* End XCSwiftPackageProductDependency section */ 428 | }; 429 | rootObject = 54B223DA29C5CF9F006F4683 /* Project object */; 430 | } 431 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #if defined(_MSC_VER) || defined(__MINGW32__) 13 | #include // using malloc.h with MSC/MINGW 14 | #elif !defined(__FreeBSD__) && !defined(__NetBSD__) 15 | #include 16 | #endif 17 | 18 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { 19 | for (int i = 1; i < argc; i++) { 20 | std::string arg = argv[i]; 21 | 22 | if (arg == "-s" || arg == "--seed") { 23 | params.seed = std::stoi(argv[++i]); 24 | } else if (arg == "-t" || arg == "--threads") { 25 | params.n_threads = std::stoi(argv[++i]); 26 | } else if (arg == "-p" || arg == "--prompt") { 27 | params.prompt = argv[++i]; 28 | } else if (arg == "-f" || arg == "--file") { 29 | 30 | std::ifstream file(argv[++i]); 31 | 32 | std::copy(std::istreambuf_iterator(file), 33 | std::istreambuf_iterator(), 34 | back_inserter(params.prompt)); 35 | 36 | } 
else if (arg == "-n" || arg == "--n_predict") { 37 | params.n_predict = std::stoi(argv[++i]); 38 | } else if (arg == "--top_k") { 39 | params.top_k = std::stoi(argv[++i]); 40 | } else if (arg == "-c" || arg == "--ctx_size") { 41 | params.n_ctx = std::stoi(argv[++i]); 42 | } else if (arg == "--top_p") { 43 | params.top_p = std::stof(argv[++i]); 44 | } else if (arg == "--temp") { 45 | params.temp = std::stof(argv[++i]); 46 | } else if (arg == "--repeat_last_n") { 47 | params.repeat_last_n = std::stoi(argv[++i]); 48 | } else if (arg == "--repeat_penalty") { 49 | params.repeat_penalty = std::stof(argv[++i]); 50 | } else if (arg == "-b" || arg == "--batch_size") { 51 | params.n_batch = std::stoi(argv[++i]); 52 | } else if (arg == "-m" || arg == "--model") { 53 | params.model = argv[++i]; 54 | } else if (arg == "-i" || arg == "--interactive") { 55 | params.interactive = true; 56 | } else if (arg == "--interactive-start") { 57 | params.interactive = true; 58 | params.interactive_start = true; 59 | } else if (arg == "--color") { 60 | params.use_color = true; 61 | } else if (arg == "-r" || arg == "--reverse-prompt") { 62 | params.antiprompt = argv[++i]; 63 | } else if (arg == "-h" || arg == "--help") { 64 | gpt_print_usage(argc, argv, params); 65 | exit(0); 66 | } else { 67 | fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); 68 | gpt_print_usage(argc, argv, params); 69 | exit(0); 70 | } 71 | } 72 | 73 | return true; 74 | } 75 | 76 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params) { 77 | fprintf(stderr, "usage: %s [options]\n", argv[0]); 78 | fprintf(stderr, "\n"); 79 | fprintf(stderr, "options:\n"); 80 | fprintf(stderr, " -h, --help show this help message and exit\n"); 81 | fprintf(stderr, " -i, --interactive run in interactive mode\n"); 82 | fprintf(stderr, " --interactive-start run in interactive mode and poll user input at startup\n"); 83 | fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); 84 | fprintf(stderr, " in interactive mode, poll user input upon seeing PROMPT\n"); 85 | fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); 86 | fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); 87 | fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); 88 | fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); 89 | fprintf(stderr, " prompt to start generation with (default: random)\n"); 90 | fprintf(stderr, " -f FNAME, --file FNAME\n"); 91 | fprintf(stderr, " prompt file to start generation.\n"); 92 | fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict); 93 | fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); 94 | fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); 95 | fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); 96 | fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); 97 | fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); 98 | fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); 99 | fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); 100 | fprintf(stderr, " -m FNAME, --model FNAME\n"); 101 | fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); 102 | fprintf(stderr, "\n"); 
103 | } 104 | 105 | std::string gpt_random_prompt(std::mt19937 & rng) { 106 | const int r = rng() % 10; 107 | switch (r) { 108 | case 0: return "So"; 109 | case 1: return "Once upon a time"; 110 | case 2: return "When"; 111 | case 3: return "The"; 112 | case 4: return "After"; 113 | case 5: return "If"; 114 | case 6: return "import"; 115 | case 7: return "He"; 116 | case 8: return "She"; 117 | case 9: return "They"; 118 | default: return "To"; 119 | } 120 | 121 | return "The"; 122 | } 123 | 124 | void replace(std::string & str, const std::string & needle, const std::string & replacement) { 125 | size_t pos = 0; 126 | while ((pos = str.find(needle, pos)) != std::string::npos) { 127 | str.replace(pos, needle.length(), replacement); 128 | pos += replacement.length(); 129 | } 130 | } 131 | 132 | std::map json_parse(const std::string & fname) { 133 | std::map result; 134 | 135 | // read file into string 136 | std::string json; 137 | { 138 | std::ifstream ifs(fname); 139 | if (!ifs) { 140 | fprintf(stderr, "Failed to open %s\n", fname.c_str()); 141 | exit(1); 142 | } 143 | 144 | json = std::string((std::istreambuf_iterator(ifs)), 145 | (std::istreambuf_iterator())); 146 | } 147 | 148 | if (json[0] != '{') { 149 | return result; 150 | } 151 | 152 | // parse json 153 | { 154 | bool has_key = false; 155 | bool in_token = false; 156 | 157 | std::string str_key = ""; 158 | std::string str_val = ""; 159 | 160 | int n = json.size(); 161 | for (int i = 1; i < n; ++i) { 162 | if (!in_token) { 163 | if (json[i] == ' ') continue; 164 | if (json[i] == '"') { 165 | in_token = true; 166 | continue; 167 | } 168 | } else { 169 | if (json[i] == '\\' && i+1 < n) { 170 | if (has_key == false) { 171 | str_key += json[i]; 172 | } else { 173 | str_val += json[i]; 174 | } 175 | ++i; 176 | } else if (json[i] == '"') { 177 | if (has_key == false) { 178 | has_key = true; 179 | ++i; 180 | while (json[i] == ' ') ++i; 181 | ++i; // : 182 | while (json[i] == ' ') ++i; 183 | if (json[i] != '\"') { 184 | while (json[i] != ',' && json[i] != '}') { 185 | str_val += json[i++]; 186 | } 187 | has_key = false; 188 | } else { 189 | in_token = true; 190 | continue; 191 | } 192 | } else { 193 | has_key = false; 194 | } 195 | 196 | ::replace(str_key, "\\u0120", " " ); // \u0120 -> space 197 | ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line 198 | ::replace(str_key, "\\\"", "\""); // \\\" -> " 199 | 200 | try { 201 | result[str_key] = std::stoi(str_val); 202 | } catch (...) 
{ 203 | //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str()); 204 | 205 | } 206 | str_key = ""; 207 | str_val = ""; 208 | in_token = false; 209 | continue; 210 | } 211 | if (has_key == false) { 212 | str_key += json[i]; 213 | } else { 214 | str_val += json[i]; 215 | } 216 | } 217 | } 218 | } 219 | 220 | return result; 221 | } 222 | 223 | std::vector gpt_tokenize(const gpt_vocab & vocab, const std::string & text) { 224 | std::vector words; 225 | 226 | // first split the text into words 227 | { 228 | std::string str = text; 229 | std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"; 230 | 231 | std::regex re(pat); 232 | std::smatch m; 233 | 234 | while (std::regex_search(str, m, re)) { 235 | for (auto x : m) { 236 | words.push_back(x); 237 | } 238 | str = m.suffix(); 239 | } 240 | } 241 | 242 | // find the longest tokens that form the words: 243 | std::vector tokens; 244 | for (const auto & word : words) { 245 | if (word.size() == 0) continue; 246 | 247 | int i = 0; 248 | int n = word.size(); 249 | while (i < n) { 250 | int j = n; 251 | while (j > i) { 252 | auto it = vocab.token_to_id.find(word.substr(i, j-i)); 253 | if (it != vocab.token_to_id.end()) { 254 | tokens.push_back(it->second); 255 | i = j; 256 | break; 257 | } 258 | --j; 259 | } 260 | if (i == n) { 261 | break; 262 | } 263 | if (j == i) { 264 | auto sub = word.substr(i, 1); 265 | if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { 266 | tokens.push_back(vocab.token_to_id.at(sub)); 267 | } else { 268 | fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); 269 | } 270 | ++i; 271 | } 272 | } 273 | } 274 | 275 | return tokens; 276 | } 277 | 278 | // TODO: Calculate this constant from the vocabulary 279 | #define MAX_TOKEN_LEN 18 280 | // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece 281 | std::vector llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) { 282 | std::vector res; 283 | std::vector score; 284 | std::vector prev; 285 | int len = text.length(); 286 | 287 | score.resize(len + 1); 288 | prev.resize(len + 1); 289 | 290 | // Forward pass 291 | for (int i = 0; i < len; i++) { 292 | int max_len = std::min(len - i, MAX_TOKEN_LEN); 293 | for (int sub_len = 1; sub_len <= len - i; sub_len++) { 294 | auto sub = text.substr(i, sub_len); 295 | auto token = vocab.token_to_id.find(sub); 296 | if (token != vocab.token_to_id.end()) { 297 | int token_score = sub.length() * sub.length(); 298 | int local_score = score[i] + token_score; 299 | int next = i + sub_len; 300 | if (score[next] < local_score) { 301 | score[next] = local_score; 302 | prev[next] = (*token).second; 303 | } 304 | } 305 | } 306 | } 307 | 308 | // Backward pass 309 | int i = len; 310 | while (i > 0) { 311 | gpt_vocab::id token_id = prev[i]; 312 | if (token_id == 0) { 313 | // TODO: Return error or something more meaningful 314 | printf("failed to tokenize string!\n"); 315 | break; 316 | } 317 | res.push_back(token_id); 318 | auto token = (*vocab.id_to_token.find(token_id)).second; 319 | i -= token.length(); 320 | } 321 | 322 | if (bos) { 323 | res.push_back(1); // TODO: replace with vocab.bos 324 | } 325 | 326 | // Pieces are in reverse order so correct that 327 | std::reverse(res.begin(), res.end()); 328 | 329 | return res; 330 | } 331 | 332 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) { 333 | printf("%s: loading vocab from '%s'\n", 
__func__, fname.c_str()); 334 | 335 | vocab.token_to_id = ::json_parse(fname); 336 | 337 | for (const auto & kv : vocab.token_to_id) { 338 | vocab.id_to_token[kv.second] = kv.first; 339 | } 340 | 341 | printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size()); 342 | 343 | // print the vocabulary 344 | //for (auto kv : vocab.token_to_id) { 345 | // printf("'%s' -> %d\n", kv.first.data(), kv.second); 346 | //} 347 | 348 | return true; 349 | } 350 | 351 | 352 | void sample_top_k(std::vector> & logits_id, int top_k) { 353 | // find the top K tokens 354 | std::partial_sort( 355 | logits_id.begin(), 356 | logits_id.begin() + top_k, logits_id.end(), 357 | [](const std::pair & a, const std::pair & b) { 358 | return a.first > b.first; 359 | }); 360 | 361 | logits_id.resize(top_k); 362 | } 363 | 364 | gpt_vocab::id llama_sample_top_p_top_k( 365 | const gpt_vocab & vocab, 366 | const float * logits, 367 | std::vector & last_n_tokens, 368 | double repeat_penalty, 369 | int top_k, 370 | double top_p, 371 | double temp, 372 | std::mt19937 & rng) { 373 | int n_logits = vocab.id_to_token.size(); 374 | 375 | std::vector> logits_id; 376 | logits_id.reserve(n_logits); 377 | 378 | { 379 | const double scale = 1.0/temp; 380 | for (int i = 0; i < n_logits; ++i) { 381 | // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) 382 | // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main 383 | if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { 384 | // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability 385 | if (logits[i] < 0.0) { 386 | logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i)); 387 | } else { 388 | logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i)); 389 | } 390 | } else { 391 | logits_id.push_back(std::make_pair(logits[i]*scale, i)); 392 | } 393 | } 394 | } 395 | 396 | sample_top_k(logits_id, top_k); 397 | 398 | double maxl = -INFINITY; 399 | for (const auto & kv : logits_id) { 400 | maxl = std::max(maxl, kv.first); 401 | } 402 | 403 | // compute probs for the top K tokens 404 | std::vector probs; 405 | probs.reserve(logits_id.size()); 406 | 407 | double sum = 0.0; 408 | for (const auto & kv : logits_id) { 409 | double p = exp(kv.first - maxl); 410 | probs.push_back(p); 411 | sum += p; 412 | } 413 | 414 | // normalize the probs 415 | for (auto & p : probs) { 416 | p /= sum; 417 | } 418 | 419 | if (top_p < 1.0f) { 420 | double cumsum = 0.0f; 421 | for (int i = 0; i < (int) probs.size(); i++) { 422 | cumsum += probs[i]; 423 | if (cumsum >= top_p) { 424 | probs.resize(i + 1); 425 | logits_id.resize(i + 1); 426 | break; 427 | } 428 | } 429 | 430 | cumsum = 1.0/cumsum; 431 | for (int i = 0; i < (int) probs.size(); i++) { 432 | probs[i] *= cumsum; 433 | } 434 | } 435 | 436 | //printf("\n"); 437 | //for (int i = 0; i < (int) 10; i++) { 438 | // printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]); 439 | //} 440 | //printf("\n\n"); 441 | //exit(0); 442 | 443 | std::discrete_distribution<> dist(probs.begin(), probs.end()); 444 | int idx = dist(rng); 445 | 446 | return logits_id[idx].second; 447 | } 448 | 449 | 450 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) { 451 | const int nb = k / qk; 452 | const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); 453 | const size_t row_size = nb*bs; 454 | 455 | assert(k % qk == 0); 456 | 457 | const size_t pp_size = 
qk / 2; 458 | uint8_t *pp = static_cast(alloca(pp_size)); 459 | 460 | char * pdst = (char *) dst; 461 | 462 | for (int j = 0; j < n; j += k) { 463 | uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); 464 | uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); 465 | 466 | for (int i = 0; i < nb; i++) { 467 | float amax = 0.0f; // absolute max 468 | 469 | { 470 | for (int l = 0; l < qk; l++) { 471 | const float v = src[j + i*qk + l]; 472 | amax = std::max(amax, fabsf(v)); 473 | } 474 | 475 | const float d = amax / ((1 << 3) - 1); 476 | const float id = d ? 1.0f/d : 0.0f; 477 | 478 | *(float *) pd = d; 479 | pd += bs; 480 | 481 | for (int l = 0; l < qk; l += 2) { 482 | const float v0 = (src[j + i*qk + l + 0])*id; 483 | const float v1 = (src[j + i*qk + l + 1])*id; 484 | 485 | const uint8_t vi0 = ((int8_t) (round(v0))) + 8; 486 | const uint8_t vi1 = ((int8_t) (round(v1))) + 8; 487 | 488 | assert(vi0 >= 0 && vi0 < 16); 489 | assert(vi1 >= 0 && vi1 < 16); 490 | 491 | hist[vi0]++; 492 | hist[vi1]++; 493 | 494 | pp[l/2] = vi0 | (vi1 << 4); 495 | } 496 | 497 | memcpy(pb, pp, pp_size); 498 | pb += bs; 499 | } 500 | } 501 | } 502 | 503 | return (n/k)*row_size; 504 | } 505 | 506 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) { 507 | const int nb = k / qk; 508 | const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2); 509 | 510 | assert(k % qk == 0); 511 | 512 | const size_t pp_size = qk / 2; 513 | uint8_t *pp = static_cast(alloca(pp_size)); 514 | 515 | char * pdst = (char *) dst; 516 | 517 | for (int j = 0; j < n; j += k) { 518 | float * pm = (float *) (pdst + (j/k)*row_size); 519 | float * pd = (float *) (pm + nb); 520 | uint8_t * pb = (uint8_t *) (pd + nb); 521 | 522 | //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb); 523 | 524 | for (int i = 0; i < nb; i++) { 525 | float min = std::numeric_limits::max(); 526 | float max = std::numeric_limits::min(); 527 | 528 | { 529 | for (int l = 0; l < qk; l++) { 530 | const float v = src[j + i*qk + l]; 531 | if (v < min) min = v; 532 | if (v > max) max = v; 533 | } 534 | 535 | const float d = (max - min) / ((1 << 4) - 1); 536 | const float id = d ? 1.0f/d : 0.0f; 537 | 538 | pm[i] = min; 539 | pd[i] = d; 540 | 541 | for (int l = 0; l < qk; l += 2) { 542 | const float v0 = (src[j + i*qk + l + 0] - min)*id; 543 | const float v1 = (src[j + i*qk + l + 1] - min)*id; 544 | 545 | const uint8_t vi0 = round(v0); 546 | const uint8_t vi1 = round(v1); 547 | 548 | assert(vi0 >= 0 && vi0 < 16); 549 | assert(vi1 >= 0 && vi1 < 16); 550 | 551 | hist[vi0]++; 552 | hist[vi1]++; 553 | 554 | pp[l/2] = vi0 | (vi1 << 4); 555 | } 556 | 557 | memcpy(pb + i*qk/2, pp, pp_size); 558 | } 559 | } 560 | } 561 | 562 | return (n/k)*row_size; 563 | } 564 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/include/ggml.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // 4 | // GGML Tensor Library 5 | // 6 | // This documentation is still a work in progress. 
7 | // If you wish some specific topics to be covered, feel free to drop a comment: 8 | // 9 | // https://github.com/ggerganov/whisper.cpp/issues/40 10 | // 11 | // ## Overview 12 | // 13 | // This library implements: 14 | // 15 | // - a set of tensor operations 16 | // - automatic differentiation 17 | // - basic optimization algorithms 18 | // 19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes, 20 | // but is not limited to, the following: 21 | // 22 | // - linear regression 23 | // - support vector machines 24 | // - neural networks 25 | // 26 | // The library allows the user to define a certain function using the available tensor operations. This function 27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition 28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the 29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized 30 | // using one of the available optimization algorithms. 31 | // 32 | // For example, here we define the function: f(x) = a*x^2 + b 33 | // 34 | // { 35 | // struct ggml_init_params params = { 36 | // .mem_size = 16*1024*1024, 37 | // .mem_buffer = NULL, 38 | // }; 39 | // 40 | // // memory allocation happens here 41 | // struct ggml_context * ctx = ggml_init(params); 42 | // 43 | // struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 44 | // 45 | // ggml_set_param(ctx, x); // x is an input variable 46 | // 47 | // struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 48 | // struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1); 49 | // struct ggml_tensor * x2 = ggml_mul(ctx, x, x); 50 | // struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, x2), b); 51 | // 52 | // ... 53 | // } 54 | // 55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only 56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0: 57 | // 58 | // { 59 | // ... 60 | // 61 | // struct ggml_cgraph gf = ggml_build_forward(f); 62 | // 63 | // // set the input variable and parameter values 64 | // ggml_set_f32(x, 2.0f); 65 | // ggml_set_f32(a, 3.0f); 66 | // ggml_set_f32(b, 4.0f); 67 | // 68 | // ggml_graph_compute(ctx0, &gf); 69 | // 70 | // printf("f = %f\n", ggml_get_f32_1d(f, 0)); 71 | // 72 | // ... 73 | // } 74 | // 75 | // The actual computation is performed in the ggml_graph_compute() function. 76 | // 77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the 78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know 79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory 80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was 81 | // actually needed. 82 | // 83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic 84 | // differentiation and optimization algorithms. 85 | // 86 | // The described approach allows to define the function graph once and then compute its forward or backward graphs 87 | // multiple times. 
All computations will use the same memory buffer allocated in the ggml_init() function. This way 88 | // the user can avoid the memory allocation overhead at runtime. 89 | // 90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class 91 | // citizens, but in theory the library can be extended to support FP8 and integer data types. 92 | // 93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary 94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became 95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear 96 | // yet, but a few examples are demonstrated in the following operations: 97 | // 98 | // - ggml_permute() 99 | // - ggml_conv_1d_1s() 100 | // - ggml_conv_1d_2s() 101 | // 102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function 103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the 104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a 105 | // calculus class, or watch the following video: 106 | // 107 | // What is Automatic Differentiation? 108 | // https://www.youtube.com/watch?v=wG_nF1awSSY 109 | // 110 | // 111 | // ## Tensor data (struct ggml_tensor) 112 | // 113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of 114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains 115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example: 116 | // 117 | // { 118 | // struct ggml_tensor * c = ggml_add(ctx, a, b); 119 | // 120 | // assert(c->src[0] == a); 121 | // assert(c->src[1] == b); 122 | // } 123 | // 124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the 125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows 126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and 127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is 128 | // contiguous in memory. 129 | // 130 | // The data of the tensor is accessed via the "data" pointer. For example: 131 | // 132 | // { 133 | // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); 134 | // 135 | // // a[1, 2] = 1.0f; 136 | // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f; 137 | // 138 | // // a[2, 0] = 2.0f; 139 | // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f; 140 | // 141 | // ... 142 | // } 143 | // 144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used. 
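//
// As a hedged illustration of the "ne"/"nb" stride convention described above (a minimal sketch,
// not part of this header; the helper name get_f32_2d and the variable names are assumptions made
// only for this example):
//
//     // Read element (i0, i1) of a 2-D F32 tensor. Walking the byte strides in nb[] instead of
//     // assuming row-major contiguity keeps the accessor valid for transposed/permuted views.
//     static float get_f32_2d(const struct ggml_tensor * t, int i0, int i1) {
//         return *(float *) ((char *) t->data + i1*t->nb[1] + i0*t->nb[0]);
//     }
//
//     // With the tensor "a" from the snippet above, get_f32_2d(a, 1, 2) would return 1.0f.
//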
145 | // 146 | // ## The matrix multiplication operator (ggml_mul_mat) 147 | // 148 | // TODO 149 | // 150 | // 151 | // ## Multi-threading 152 | // 153 | // TODO 154 | // 155 | // 156 | // ## Overview of ggml.c 157 | // 158 | // TODO 159 | // 160 | // 161 | // ## SIMD optimizations 162 | // 163 | // TODO 164 | // 165 | // 166 | // ## Debugging ggml 167 | // 168 | // TODO 169 | // 170 | // 171 | 172 | #ifdef __cplusplus 173 | extern "C" { 174 | #endif 175 | 176 | #include 177 | #include 178 | #include 179 | 180 | #define GGML_MAX_DIMS 4 181 | #define GGML_MAX_NODES 4096 182 | #define GGML_MAX_PARAMS 16 183 | #define GGML_MAX_CONTEXTS 64 184 | #define GGML_MAX_OPT 4 185 | 186 | #ifdef __ARM_NEON 187 | // we use the built-in 16-bit float type 188 | typedef __fp16 ggml_fp16_t; 189 | #else 190 | typedef uint16_t ggml_fp16_t; 191 | #endif 192 | 193 | // convert FP16 <-> FP32 194 | float ggml_fp16_to_fp32(ggml_fp16_t x); 195 | ggml_fp16_t ggml_fp32_to_fp16(float x); 196 | 197 | struct ggml_object; 198 | struct ggml_context; 199 | 200 | enum ggml_type { 201 | GGML_TYPE_Q4_0, 202 | GGML_TYPE_Q4_1, 203 | GGML_TYPE_I8, 204 | GGML_TYPE_I16, 205 | GGML_TYPE_I32, 206 | GGML_TYPE_F16, 207 | GGML_TYPE_F32, 208 | GGML_TYPE_COUNT, 209 | }; 210 | 211 | // available tensor operations: 212 | enum ggml_op { 213 | GGML_OP_NONE = 0, 214 | 215 | GGML_OP_DUP, 216 | GGML_OP_ADD, 217 | GGML_OP_SUB, 218 | GGML_OP_MUL, 219 | GGML_OP_DIV, 220 | GGML_OP_SQR, 221 | GGML_OP_SQRT, 222 | GGML_OP_SUM, 223 | GGML_OP_MEAN, 224 | GGML_OP_REPEAT, 225 | GGML_OP_ABS, 226 | GGML_OP_SGN, 227 | GGML_OP_NEG, 228 | GGML_OP_STEP, 229 | GGML_OP_RELU, 230 | GGML_OP_GELU, 231 | GGML_OP_SILU, 232 | GGML_OP_NORM, // normalize 233 | GGML_OP_RMS_NORM, 234 | 235 | GGML_OP_MUL_MAT, 236 | 237 | GGML_OP_SCALE, 238 | GGML_OP_CPY, 239 | GGML_OP_RESHAPE, 240 | GGML_OP_VIEW, 241 | GGML_OP_PERMUTE, 242 | GGML_OP_TRANSPOSE, 243 | GGML_OP_GET_ROWS, 244 | GGML_OP_DIAG_MASK_INF, 245 | GGML_OP_SOFT_MAX, 246 | GGML_OP_ROPE, 247 | GGML_OP_CONV_1D_1S, 248 | GGML_OP_CONV_1D_2S, 249 | 250 | GGML_OP_FLASH_ATTN, 251 | GGML_OP_FLASH_FF, 252 | 253 | GGML_OP_COUNT, 254 | }; 255 | 256 | // n-dimensional tensor 257 | struct ggml_tensor { 258 | enum ggml_type type; 259 | 260 | int n_dims; 261 | int ne[GGML_MAX_DIMS]; // number of elements 262 | size_t nb[GGML_MAX_DIMS]; // stride in bytes: 263 | // nb[0] = sizeof(type) 264 | // nb[1] = nb[0] * ne[0] + padding 265 | // nb[i] = nb[i-1] * ne[i-1] 266 | 267 | // compute data 268 | enum ggml_op op; 269 | 270 | bool is_param; 271 | 272 | struct ggml_tensor * grad; 273 | struct ggml_tensor * src0; 274 | struct ggml_tensor * src1; 275 | struct ggml_tensor * opt[GGML_MAX_OPT]; 276 | 277 | // thread scheduling 278 | int n_tasks; 279 | 280 | // performance 281 | int perf_runs; 282 | int64_t perf_cycles; 283 | int64_t perf_time_us; 284 | 285 | void * data; 286 | char padding[8]; 287 | }; 288 | 289 | // computation graph 290 | struct ggml_cgraph { 291 | int n_nodes; 292 | int n_leafs; 293 | int n_threads; 294 | 295 | size_t work_size; 296 | struct ggml_tensor * work; 297 | 298 | struct ggml_tensor * nodes[GGML_MAX_NODES]; 299 | struct ggml_tensor * grads[GGML_MAX_NODES]; 300 | struct ggml_tensor * leafs[GGML_MAX_NODES]; 301 | 302 | // performance 303 | int perf_runs; 304 | int64_t perf_cycles; 305 | int64_t perf_time_us; 306 | }; 307 | 308 | // scratch buffer 309 | struct ggml_scratch { 310 | size_t offs; 311 | size_t size; 312 | void * data; 313 | }; 314 | 315 | struct ggml_init_params { 316 | // memory pool 317 | size_t mem_size; 
// bytes 318 | void * mem_buffer; // if NULL, memory will be allocated internally 319 | }; 320 | 321 | void ggml_time_init(void); // call this once at the beginning of the program 322 | int64_t ggml_time_ms(void); 323 | int64_t ggml_time_us(void); 324 | int64_t ggml_cycles(void); 325 | int64_t ggml_cycles_per_ms(void); 326 | 327 | void ggml_print_object (const struct ggml_object * obj); 328 | void ggml_print_objects(const struct ggml_context * ctx); 329 | 330 | int ggml_nelements(const struct ggml_tensor * tensor); 331 | size_t ggml_nbytes (const struct ggml_tensor * tensor); 332 | 333 | int ggml_blck_size (enum ggml_type type); 334 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block 335 | float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float 336 | 337 | size_t ggml_element_size(const struct ggml_tensor * tensor); 338 | 339 | struct ggml_context * ggml_init(struct ggml_init_params params); 340 | void ggml_free(struct ggml_context * ctx); 341 | 342 | size_t ggml_used_mem(const struct ggml_context * ctx); 343 | 344 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); 345 | 346 | struct ggml_tensor * ggml_new_tensor( 347 | struct ggml_context * ctx, 348 | enum ggml_type type, 349 | int n_dims, 350 | const int *ne); 351 | 352 | struct ggml_tensor * ggml_new_tensor_1d( 353 | struct ggml_context * ctx, 354 | enum ggml_type type, 355 | int ne0); 356 | 357 | 358 | struct ggml_tensor * ggml_new_tensor_1d_dummy( 359 | struct ggml_context * ctx, 360 | enum ggml_type type, 361 | int ne0); 362 | 363 | 364 | struct ggml_tensor * ggml_new_tensor_2d_dummy( 365 | struct ggml_context * ctx, 366 | enum ggml_type type, 367 | int ne0, 368 | int ne1); 369 | 370 | 371 | struct ggml_tensor * ggml_new_tensor_2d( 372 | struct ggml_context * ctx, 373 | enum ggml_type type, 374 | int ne0, 375 | int ne1); 376 | 377 | struct ggml_tensor * ggml_new_tensor_3d( 378 | struct ggml_context * ctx, 379 | enum ggml_type type, 380 | int ne0, 381 | int ne1, 382 | int ne2); 383 | 384 | struct ggml_tensor * ggml_new_tensor_4d( 385 | struct ggml_context * ctx, 386 | enum ggml_type type, 387 | int ne0, 388 | int ne1, 389 | int ne2, 390 | int ne3); 391 | 392 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); 393 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); 394 | 395 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); 396 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); 397 | 398 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); 399 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); 400 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); 401 | 402 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); 403 | void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); 404 | 405 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); 406 | void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); 407 | 408 | void * ggml_get_data (const struct ggml_tensor * tensor); 409 | float * ggml_get_data_f32(const struct ggml_tensor * tensor); 410 | 411 | // 412 | // operations on tensors with backpropagation 413 | // 414 | 415 | struct ggml_tensor * ggml_dup( 416 | struct ggml_context * ctx, 417 | struct ggml_tensor * a); 418 | 419 | struct ggml_tensor 
* ggml_add( 420 | struct ggml_context * ctx, 421 | struct ggml_tensor * a, 422 | struct ggml_tensor * b); 423 | 424 | struct ggml_tensor * ggml_sub( 425 | struct ggml_context * ctx, 426 | struct ggml_tensor * a, 427 | struct ggml_tensor * b); 428 | 429 | struct ggml_tensor * ggml_mul( 430 | struct ggml_context * ctx, 431 | struct ggml_tensor * a, 432 | struct ggml_tensor * b); 433 | 434 | struct ggml_tensor * ggml_div( 435 | struct ggml_context * ctx, 436 | struct ggml_tensor * a, 437 | struct ggml_tensor * b); 438 | 439 | struct ggml_tensor * ggml_sqr( 440 | struct ggml_context * ctx, 441 | struct ggml_tensor * a); 442 | 443 | struct ggml_tensor * ggml_sqrt( 444 | struct ggml_context * ctx, 445 | struct ggml_tensor * a); 446 | 447 | // return scalar 448 | // TODO: compute sum along rows 449 | struct ggml_tensor * ggml_sum( 450 | struct ggml_context * ctx, 451 | struct ggml_tensor * a); 452 | 453 | // mean along rows 454 | struct ggml_tensor * ggml_mean( 455 | struct ggml_context * ctx, 456 | struct ggml_tensor * a); 457 | 458 | // if a is the same shape as b, and a is not parameter, return a 459 | // otherwise, return a new tensor: repeat(a) to fit in b 460 | struct ggml_tensor * ggml_repeat( 461 | struct ggml_context * ctx, 462 | struct ggml_tensor * a, 463 | struct ggml_tensor * b); 464 | 465 | struct ggml_tensor * ggml_abs( 466 | struct ggml_context * ctx, 467 | struct ggml_tensor * a); 468 | 469 | struct ggml_tensor * ggml_sgn( 470 | struct ggml_context * ctx, 471 | struct ggml_tensor * a); 472 | 473 | struct ggml_tensor * ggml_neg( 474 | struct ggml_context * ctx, 475 | struct ggml_tensor * a); 476 | 477 | struct ggml_tensor * ggml_step( 478 | struct ggml_context * ctx, 479 | struct ggml_tensor * a); 480 | 481 | struct ggml_tensor * ggml_relu( 482 | struct ggml_context * ctx, 483 | struct ggml_tensor * a); 484 | 485 | // TODO: double-check this computation is correct 486 | struct ggml_tensor * ggml_gelu( 487 | struct ggml_context * ctx, 488 | struct ggml_tensor * a); 489 | 490 | struct ggml_tensor * ggml_silu( 491 | struct ggml_context * ctx, 492 | struct ggml_tensor * a); 493 | 494 | // normalize along rows 495 | // TODO: eps is hardcoded to 1e-5 for now 496 | struct ggml_tensor * ggml_norm( 497 | struct ggml_context * ctx, 498 | struct ggml_tensor * a); 499 | 500 | struct ggml_tensor * ggml_rms_norm( 501 | struct ggml_context * ctx, 502 | struct ggml_tensor * a); 503 | 504 | // A: m rows, n columns 505 | // B: p rows, n columns (i.e. 
we transpose it internally) 506 | // result is m columns, p rows 507 | struct ggml_tensor * ggml_mul_mat( 508 | struct ggml_context * ctx, 509 | struct ggml_tensor * a, 510 | struct ggml_tensor * b); 511 | 512 | // 513 | // operations on tensors without backpropagation 514 | // 515 | 516 | // in-place, returns view(a) 517 | struct ggml_tensor * ggml_scale( 518 | struct ggml_context * ctx, 519 | struct ggml_tensor * a, 520 | struct ggml_tensor * b); 521 | 522 | // a -> b, return view(b) 523 | struct ggml_tensor * ggml_cpy( 524 | struct ggml_context * ctx, 525 | struct ggml_tensor * a, 526 | struct ggml_tensor * b); 527 | 528 | // return view(a), b specifies the new shape 529 | // TODO: when we start computing gradient, make a copy instead of view 530 | struct ggml_tensor * ggml_reshape( 531 | struct ggml_context * ctx, 532 | struct ggml_tensor * a, 533 | struct ggml_tensor * b); 534 | 535 | // return view(a) 536 | // TODO: when we start computing gradient, make a copy instead of view 537 | struct ggml_tensor * ggml_reshape_2d( 538 | struct ggml_context * ctx, 539 | struct ggml_tensor * a, 540 | int ne0, 541 | int ne1); 542 | 543 | // return view(a) 544 | // TODO: when we start computing gradient, make a copy instead of view 545 | struct ggml_tensor * ggml_reshape_3d( 546 | struct ggml_context * ctx, 547 | struct ggml_tensor * a, 548 | int ne0, 549 | int ne1, 550 | int ne2); 551 | 552 | // offset in bytes 553 | struct ggml_tensor * ggml_view_1d( 554 | struct ggml_context * ctx, 555 | struct ggml_tensor * a, 556 | int ne0, 557 | size_t offset); 558 | 559 | struct ggml_tensor * ggml_view_2d( 560 | struct ggml_context * ctx, 561 | struct ggml_tensor * a, 562 | int ne0, 563 | int ne1, 564 | size_t nb1, // row stride in bytes 565 | size_t offset); 566 | 567 | struct ggml_tensor * ggml_permute( 568 | struct ggml_context * ctx, 569 | struct ggml_tensor * a, 570 | int axis0, 571 | int axis1, 572 | int axis2, 573 | int axis3); 574 | 575 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3) 576 | struct ggml_tensor * ggml_transpose( 577 | struct ggml_context * ctx, 578 | struct ggml_tensor * a); 579 | 580 | struct ggml_tensor * ggml_get_rows( 581 | struct ggml_context * ctx, 582 | struct ggml_tensor * a, 583 | struct ggml_tensor * b); 584 | 585 | // set elements above the diagonal to -INF 586 | // in-place, returns view(a) 587 | struct ggml_tensor * ggml_diag_mask_inf( 588 | struct ggml_context * ctx, 589 | struct ggml_tensor * a, 590 | int n_past); 591 | 592 | // in-place, returns view(a) 593 | struct ggml_tensor * ggml_soft_max( 594 | struct ggml_context * ctx, 595 | struct ggml_tensor * a); 596 | 597 | // rotary position embedding 598 | // in-place, returns view(a) 599 | // if mode == 1, skip n_past elements 600 | // TODO: avoid creating a new tensor every time 601 | struct ggml_tensor * ggml_rope( 602 | struct ggml_context * ctx, 603 | struct ggml_tensor * a, 604 | int n_past, 605 | int n_dims, 606 | int mode); 607 | 608 | // padding = 1 609 | // TODO: we don't support extra parameters for now 610 | // that's why we are hard-coding the stride, padding, and dilation 611 | // not great .. 
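//
// A hedged usage sketch for the two 1-D convolution operators declared below (stride 1 and
// stride 2). The argument order — kernel first, signal second — and the tensor names "ker" and
// "sig" are assumptions made for illustration only; check ggml.c for the authoritative semantics.
//
//     struct ggml_tensor * y1 = ggml_conv_1d_1s(ctx, ker, sig);   // stride 1, padding fixed to 1
//     struct ggml_tensor * y2 = ggml_conv_1d_2s(ctx, ker, sig);   // stride 2, padding fixed to 1
//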
612 | struct ggml_tensor * ggml_conv_1d_1s( 613 | struct ggml_context * ctx, 614 | struct ggml_tensor * a, 615 | struct ggml_tensor * b); 616 | 617 | struct ggml_tensor * ggml_conv_1d_2s( 618 | struct ggml_context * ctx, 619 | struct ggml_tensor * a, 620 | struct ggml_tensor * b); 621 | 622 | struct ggml_tensor * ggml_flash_attn( 623 | struct ggml_context * ctx, 624 | struct ggml_tensor * q, 625 | struct ggml_tensor * k, 626 | struct ggml_tensor * v, 627 | bool masked); 628 | 629 | struct ggml_tensor * ggml_flash_ff( 630 | struct ggml_context * ctx, 631 | struct ggml_tensor * a, 632 | struct ggml_tensor * b0, 633 | struct ggml_tensor * b1, 634 | struct ggml_tensor * c0, 635 | struct ggml_tensor * c1); 636 | 637 | // 638 | // automatic differentiation 639 | // 640 | 641 | void ggml_set_param( 642 | struct ggml_context * ctx, 643 | struct ggml_tensor * tensor); 644 | 645 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); 646 | 647 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); 648 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); 649 | 650 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); 651 | void ggml_graph_reset (struct ggml_cgraph * cgraph); 652 | 653 | // print info and performance information for the graph 654 | void ggml_graph_print(const struct ggml_cgraph * cgraph); 655 | 656 | // dump the graph into a file using the dot format 657 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); 658 | 659 | // 660 | // optimization 661 | // 662 | 663 | // optimization methods 664 | enum ggml_opt_type { 665 | GGML_OPT_ADAM, 666 | GGML_OPT_LBFGS, 667 | }; 668 | 669 | // linesearch methods 670 | enum ggml_linesearch { 671 | GGML_LINESEARCH_DEFAULT = 1, 672 | 673 | GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0, 674 | GGML_LINESEARCH_BACKTRACKING_WOLFE = 1, 675 | GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2, 676 | }; 677 | 678 | // optimization return values 679 | enum ggml_opt_result { 680 | GGML_OPT_OK = 0, 681 | GGML_OPT_DID_NOT_CONVERGE, 682 | GGML_OPT_NO_CONTEXT, 683 | GGML_OPT_INVALID_WOLFE, 684 | GGML_OPT_FAIL, 685 | 686 | GGML_LINESEARCH_FAIL = -128, 687 | GGML_LINESEARCH_MINIMUM_STEP, 688 | GGML_LINESEARCH_MAXIMUM_STEP, 689 | GGML_LINESEARCH_MAXIMUM_ITERATIONS, 690 | GGML_LINESEARCH_INVALID_PARAMETERS, 691 | }; 692 | 693 | // optimization parameters 694 | // 695 | // see ggml.c (ggml_opt_default_params) for default values 696 | // 697 | struct ggml_opt_params { 698 | enum ggml_opt_type type; 699 | 700 | int n_threads; 701 | 702 | // delta-based convergence test 703 | // 704 | // if past == 0 - disabled 705 | // if past > 0: 706 | // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|) 707 | // 708 | int past; 709 | float delta; 710 | 711 | // maximum number of iterations without improvement 712 | // 713 | // if 0 - disabled 714 | // if > 0: 715 | // assume convergence if no cost improvement in this number of iterations 716 | // 717 | int max_no_improvement; 718 | 719 | bool print_forward_graph; 720 | bool print_backward_graph; 721 | 722 | // ADAM parameters 723 | struct { 724 | int n_iter; 725 | 726 | float alpha; // learning rate 727 | float beta1; 728 | float beta2; 729 | float eps; // epsilon for numerical stability 730 | float eps_f; // epsilon for convergence test 731 | float eps_g; // epsilon for convergence test 732 | } adam; 733 | 734 | // LBFGS parameters 735 | struct { 736 | 
int m; // number of corrections to approximate the inv. Hessian 737 | int n_iter; 738 | int max_linesearch; 739 | 740 | float eps; // convergence tolerance 741 | float ftol; // line search tolerance 742 | float wolfe; 743 | float min_step; 744 | float max_step; 745 | 746 | enum ggml_linesearch linesearch; 747 | } lbfgs; 748 | }; 749 | 750 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); 751 | 752 | // optimize the function defined by the tensor f 753 | enum ggml_opt_result ggml_opt( 754 | struct ggml_context * ctx, 755 | struct ggml_opt_params params, 756 | struct ggml_tensor * f); 757 | 758 | // 759 | // system info 760 | // 761 | 762 | int ggml_cpu_has_avx(void); 763 | int ggml_cpu_has_avx2(void); 764 | int ggml_cpu_has_avx512(void); 765 | int ggml_cpu_has_fma(void); 766 | int ggml_cpu_has_neon(void); 767 | int ggml_cpu_has_arm_fma(void); 768 | int ggml_cpu_has_f16c(void); 769 | int ggml_cpu_has_fp16_va(void); 770 | int ggml_cpu_has_wasm_simd(void); 771 | int ggml_cpu_has_blas(void); 772 | int ggml_cpu_has_sse3(void); 773 | int ggml_cpu_has_vsx(void); 774 | 775 | #ifdef __cplusplus 776 | } 777 | #endif 778 | -------------------------------------------------------------------------------- /Sources/alpaca.cpp/chat.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // chat.cpp 3 | // alpaca.cpp 4 | // 5 | // Created by Yoshimasa Niwa on 3/16/23. 6 | // 7 | 8 | #include "chat.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | // determine number of model parts based on the dimension 22 | static const std::map LLAMA_N_PARTS = { 23 | { 4096, 1 }, 24 | { 5120, 1 }, 25 | { 6656, 4 }, 26 | { 8192, 8 }, 27 | }; 28 | 29 | static int fin_init(mbuf_t& mbuf, const char* fname) 30 | { 31 | int fd, nread; 32 | struct stat sb; 33 | if((fd = open(fname, O_RDONLY)) < 0){ 34 | #if DEBUG 35 | printf("mmap open %s failed\n", fname); 36 | #endif // DEBUG 37 | return -1; 38 | } 39 | if((fstat(fd, &sb)) == -1 ){ 40 | #if DEBUG 41 | printf("fstat failed\n"); 42 | #endif // DEBUG 43 | return -1; 44 | } 45 | char* model_buf = (char*)mmap(\ 46 | NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); //MAP_SHARED, MAP_PRIVATE 47 | if((void*)model_buf ==(void*) -1){ 48 | #if DEBUG 49 | printf("mmap failed\n"); 50 | #endif // DEBUG 51 | close(fd); 52 | return -1; 53 | } 54 | close(fd); 55 | mbuf.buf = model_buf; 56 | mbuf.size = (size_t)sb.st_size; 57 | mbuf.p = mbuf.buf; 58 | mbuf.oft = 0; 59 | #if DEBUG 60 | printf("mmap 0x%lx~0x%lx, size=0x%lx\r\n", (size_t)(model_buf), (size_t)(model_buf+mbuf.size), mbuf.size); 61 | #endif // DEBUG 62 | return 0; 63 | } 64 | 65 | static void fin_read(mbuf_t& mbuf, char* dst, size_t len) 66 | { 67 | if(mbuf.oft+len>mbuf.size){ 68 | len = mbuf.size - mbuf.oft; 69 | } 70 | memcpy(dst, mbuf.p, len); 71 | mbuf.oft+=len; 72 | mbuf.p+=len; 73 | } 74 | 75 | static void fin_read_dummy(mbuf_t& mbuf, char** dst, size_t len) 76 | { 77 | if(mbuf.oft+len>mbuf.size){ 78 | len = mbuf.size - mbuf.oft; 79 | } 80 | //memcpy(dst, mbuf.p, len); 81 | *dst = mbuf.p; 82 | mbuf.oft+=len; 83 | mbuf.p+=len; 84 | } 85 | 86 | static size_t fin_tellg(mbuf_t& mbuf) 87 | { 88 | return mbuf.oft; 89 | } 90 | 91 | static void fin_seekg(mbuf_t& mbuf, size_t oft) 92 | { 93 | mbuf.oft = oft; 94 | mbuf.p = mbuf.buf + oft; 95 | return; 96 | } 97 | 98 | static bool fin_eof(mbuf_t& mbuf) 99 | { 100 | return mbuf.oft>=mbuf.size; 101 | } 102 | 103 | static void 
fin_close(mbuf_t& mbuf) 104 | { 105 | if (!mbuf.buf) { 106 | return; 107 | } 108 | munmap(mbuf.buf, mbuf.size); 109 | mbuf.buf = NULL; 110 | mbuf.size = 0; 111 | mbuf.p = NULL; 112 | mbuf.oft = 0; 113 | } 114 | 115 | void llma_model_unload(llama_model &model) 116 | { 117 | ggml_free(model.ctx); 118 | fin_close(model.mbuf); 119 | } 120 | 121 | // load the model's weights from a file, use mmap to save memory 122 | bool llama_model_load_lowmem(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 123 | #if DEBUG 124 | fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 125 | 126 | fprintf(stderr, "BLAS= %d\n", ggml_cpu_has_blas()); 127 | fprintf(stderr, "NEON= %d\n", ggml_cpu_has_neon()); 128 | fprintf(stderr, "ARM_FMA= %d\n", ggml_cpu_has_arm_fma()); 129 | #endif // DEBUG 130 | std::vector f_buf(1024*1024); 131 | 132 | int res=fin_init(model.mbuf, fname.c_str()); 133 | if(res) return false; 134 | 135 | // verify magic 136 | { 137 | uint32_t magic; 138 | fin_read(model.mbuf, (char *) &magic, sizeof(magic)); 139 | if (magic != 0x67676d6c) { 140 | #if DEBUG 141 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 142 | #endif // DEBUG 143 | return false; 144 | } 145 | } 146 | 147 | int n_ff = 0; 148 | int n_parts = 0; 149 | 150 | // load hparams 151 | { 152 | auto & hparams = model.hparams; 153 | 154 | fin_read(model.mbuf, (char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 155 | //fin_read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 156 | fin_read(model.mbuf, (char *) &hparams.n_embd, sizeof(hparams.n_embd)); 157 | fin_read(model.mbuf, (char *) &hparams.n_mult, sizeof(hparams.n_mult)); 158 | fin_read(model.mbuf, (char *) &hparams.n_head, sizeof(hparams.n_head)); 159 | fin_read(model.mbuf, (char *) &hparams.n_layer, sizeof(hparams.n_layer)); 160 | fin_read(model.mbuf, (char *) &hparams.n_rot, sizeof(hparams.n_rot)); 161 | fin_read(model.mbuf, (char *) &hparams.f16, sizeof(hparams.f16)); 162 | 163 | hparams.n_ctx = n_ctx; 164 | 165 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 166 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 167 | 168 | #if DEBUG 169 | fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); 170 | fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); 171 | fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); 172 | fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); 173 | fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); 174 | fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); 175 | fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); 176 | fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); 177 | fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); 178 | fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); 179 | #endif // DEBUG 180 | } 181 | 182 | // load vocab 183 | { 184 | const int32_t n_vocab = model.hparams.n_vocab; 185 | 186 | if (n_vocab != model.hparams.n_vocab) { 187 | #if DEBUG 188 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 189 | __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 190 | #endif // DEBUG 191 | return false; 192 | } 193 | 194 | std::string word; 195 | for (int i = 0; i < n_vocab; i++) { 196 | uint32_t len; 197 | fin_read(model.mbuf, (char *) &len, sizeof(len)); 198 | 199 | word.resize(len); 200 | fin_read(model.mbuf, (char *) word.data(), len); 201 | 202 | vocab.token_to_id[word] 
= i; 203 | vocab.id_to_token[i] = word; 204 | 205 | //if (i < 30000) { 206 | // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 207 | //} 208 | } 209 | } 210 | 211 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 212 | // in order to save memory and also to speed up the computation 213 | ggml_type wtype = GGML_TYPE_COUNT; 214 | switch (model.hparams.f16) { 215 | case 0: wtype = GGML_TYPE_F32; break; 216 | case 1: wtype = GGML_TYPE_F16; break; 217 | case 2: wtype = GGML_TYPE_Q4_0; break; 218 | case 3: wtype = GGML_TYPE_Q4_1; break; 219 | default: 220 | { 221 | #if DEBUG 222 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 223 | __func__, fname.c_str(), model.hparams.f16); 224 | #endif // DEBUG 225 | return false; 226 | } 227 | } 228 | 229 | const ggml_type wtype2 = GGML_TYPE_F32; 230 | 231 | auto & ctx = model.ctx; 232 | 233 | size_t ctx_size = 0; 234 | 235 | { 236 | const auto & hparams = model.hparams; 237 | 238 | const int n_embd = hparams.n_embd; 239 | const int n_layer = hparams.n_layer; 240 | const int n_ctx = hparams.n_ctx; 241 | const int n_vocab = hparams.n_vocab; 242 | 243 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 244 | 245 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 246 | 247 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 248 | 249 | /* 250 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 251 | 252 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 253 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 254 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 255 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 256 | 257 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 258 | 259 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 260 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 261 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 262 | */ 263 | 264 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k 265 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v 266 | 267 | ctx_size += (5 + 10*n_layer)*256; // object overhead 268 | 269 | #if DEBUG 270 | fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 271 | #endif // DEBUG 272 | } 273 | 274 | // create the ggml context 275 | { 276 | struct ggml_init_params params = { 277 | /*.mem_size =*/ ctx_size, 278 | /*.mem_buffer =*/ NULL, 279 | }; 280 | 281 | model.ctx = ggml_init(params); 282 | if (!model.ctx) { 283 | #if DEBUG 284 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 285 | #endif // DEBUG 286 | return false; 287 | } 288 | } 289 | 290 | // prepare memory for the weights 291 | { 292 | const auto & hparams = model.hparams; 293 | 294 | const int n_embd = hparams.n_embd; 295 | const int n_layer = hparams.n_layer; 296 | const int n_ctx = hparams.n_ctx; 297 | const int n_vocab = hparams.n_vocab; 298 | 299 | model.layers.resize(n_layer); 300 | 301 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 302 | 303 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 304 | model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 305 | 306 | // map by name 307 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 308 | 309 | model.tensors["norm.weight"] = model.norm; 310 | 
model.tensors["output.weight"] = model.output; 311 | 312 | for (int i = 0; i < n_layer; ++i) { 313 | auto & layer = model.layers[i]; 314 | 315 | layer.attention_norm = ggml_new_tensor_1d_dummy(ctx, GGML_TYPE_F32, n_embd); 316 | 317 | layer.wq = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd); 318 | layer.wk = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd); 319 | layer.wv = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd); 320 | layer.wo = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd); 321 | 322 | layer.ffn_norm = ggml_new_tensor_1d_dummy(ctx, GGML_TYPE_F32, n_embd); 323 | 324 | layer.w1 = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_ff); 325 | layer.w2 = ggml_new_tensor_2d_dummy(ctx, wtype, n_ff, n_embd); 326 | layer.w3 = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_ff); 327 | 328 | // map by name 329 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 330 | 331 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 332 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 333 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 334 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 335 | 336 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 337 | 338 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 339 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 340 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; 341 | } 342 | } 343 | 344 | // key + value memory 345 | { 346 | const auto & hparams = model.hparams; 347 | 348 | const int n_embd = hparams.n_embd; 349 | const int n_layer = hparams.n_layer; 350 | const int n_ctx = hparams.n_ctx; 351 | 352 | const int n_mem = n_layer*n_ctx; 353 | const int n_elements = n_embd*n_mem; 354 | 355 | model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); 356 | model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements); 357 | 358 | const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); 359 | #if DEBUG 360 | fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); 361 | #endif // DEBUG 362 | } 363 | 364 | const size_t file_offset = fin_tellg(model.mbuf); 365 | 366 | //fin_close(); 367 | 368 | std::vector tmp; 369 | 370 | for (int i = 0; i < n_parts; ++i) { 371 | const int part_id = i; 372 | //const int part_id = n_parts - i - 1; 373 | 374 | std::string fname_part = fname; 375 | if (i > 0) { 376 | fname_part += "." 
+ std::to_string(i); 377 | } 378 | 379 | //fin_init(fname_part.c_str()); 380 | fin_seekg(model.mbuf, file_offset); 381 | 382 | // load weights 383 | { 384 | int n_tensors = 0; 385 | size_t total_size = 0; 386 | 387 | #if DEBUG 388 | fprintf(stderr, "%s: ", __func__); 389 | #endif // DEBUG 390 | 391 | while (true) { 392 | int32_t n_dims; 393 | int32_t length; 394 | int32_t ftype; 395 | 396 | fin_read(model.mbuf, reinterpret_cast(&n_dims), sizeof(n_dims)); 397 | fin_read(model.mbuf, reinterpret_cast(&length), sizeof(length)); 398 | fin_read(model.mbuf, reinterpret_cast(&ftype), sizeof(ftype)); 399 | 400 | if (fin_eof(model.mbuf)) { 401 | break; 402 | } 403 | 404 | int32_t nelements = 1; 405 | int32_t ne[2] = { 1, 1 }; 406 | for (int i = 0; i < n_dims; ++i) { 407 | fin_read(model.mbuf, reinterpret_cast(&ne[i]), sizeof(ne[i])); 408 | nelements *= ne[i]; 409 | } 410 | 411 | std::string name(length, 0); 412 | fin_read(model.mbuf, &name[0], length); 413 | 414 | if (model.tensors.find(name.data()) == model.tensors.end()) { 415 | #if DEBUG 416 | fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); 417 | #endif // DEBUG 418 | return false; 419 | } 420 | 421 | // split_type = 0: split by columns 422 | // split_type = 1: split by rows 423 | int split_type = 0; 424 | 425 | // split_type = 0: 426 | // regex: 427 | // - tok_embeddings.* 428 | // - layers.*.attention.wo.weight 429 | // - layers.*.feed_forward.w2.weight 430 | 431 | // split_type = 1: 432 | // regex: 433 | // - output.* 434 | // - layers.*.attention.wq.weight 435 | // - layers.*.attention.wk.weight 436 | // - layers.*.attention.wv.weight 437 | // - layers.*.feed_forward.w1.weight 438 | // - layers.*.feed_forward.w3.weight 439 | if (name.find("tok_embeddings") != std::string::npos) { 440 | split_type = 0; 441 | } else if (name.find("layers") != std::string::npos) { 442 | if (name.find("attention.wo.weight") != std::string::npos) { 443 | split_type = 0; 444 | } else if (name.find("feed_forward.w2.weight") != std::string::npos) { 445 | split_type = 0; 446 | } else { 447 | split_type = 1; 448 | } 449 | } else if (name.find("output") != std::string::npos) { 450 | split_type = 1; 451 | } 452 | 453 | auto tensor = model.tensors[name.data()]; 454 | 455 | if (n_dims == 1) { 456 | if (ggml_nelements(tensor) != nelements) { 457 | #if DEBUG 458 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 459 | #endif // DEBUG 460 | return false; 461 | } 462 | } else { 463 | if (ggml_nelements(tensor)/n_parts != nelements) { 464 | #if DEBUG 465 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 466 | #endif // DEBUG 467 | return false; 468 | } 469 | } 470 | 471 | if (n_dims == 1) { 472 | if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { 473 | #if DEBUG 474 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 475 | __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); 476 | #endif // DEBUG 477 | return false; 478 | } 479 | } else { 480 | if (split_type == 0) { 481 | if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) { 482 | #if DEBUG 483 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 484 | __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]); 485 | #endif // DEBUG 486 | return false; 487 | } 488 | } else { 489 | if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) { 490 | #if DEBUG 491 | 
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 492 | __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]); 493 | #endif // DEBUG 494 | return false; 495 | } 496 | } 497 | } 498 | 499 | #if DEBUG 500 | if (0) { 501 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 502 | fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type); 503 | } 504 | #endif // DEBUG 505 | 506 | size_t bpe = 0; 507 | 508 | switch (ftype) { 509 | case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; 510 | case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; 511 | case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; 512 | case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; 513 | default: 514 | { 515 | #if DEBUG 516 | fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); 517 | #endif // DEBUG 518 | return false; 519 | } 520 | }; 521 | 522 | if (n_dims == 1 || n_parts == 1) { 523 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { 524 | #if DEBUG 525 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 526 | __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); 527 | #endif // DEBUG 528 | return false; 529 | } 530 | 531 | if (part_id == 0) { 532 | //change here to enable mmap load 533 | //fin_read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); 534 | //fin_read_dummy((char**)&tensor->data, ggml_nbytes(tensor)); 535 | size_t len = ggml_nbytes(tensor); 536 | fin_read_dummy(model.mbuf, (char**)&tensor->data, len); 537 | 538 | } else { 539 | fin_seekg(model.mbuf, ggml_nbytes(tensor)); 540 | } 541 | 542 | total_size += ggml_nbytes(tensor); 543 | } else { 544 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) { 545 | #if DEBUG 546 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 547 | __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe); 548 | #endif // DEBUG 549 | return false; 550 | } 551 | 552 | if (split_type == 0) { 553 | const int np0 = ne[0]; 554 | 555 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 556 | assert(row_size == tensor->nb[1]); 557 | 558 | for (int i1 = 0; i1 < ne[1]; ++i1) { 559 | const size_t offset_row = i1*row_size; 560 | const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 561 | fin_read(model.mbuf, reinterpret_cast(tensor->data) + offset, row_size/n_parts); 562 | } 563 | } else { 564 | const int np1 = ne[1]; 565 | 566 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 567 | 568 | for (int i1 = 0; i1 < ne[1]; ++i1) { 569 | const size_t offset_row = (i1 + part_id*np1)*row_size; 570 | fin_read(model.mbuf, reinterpret_cast(tensor->data) + offset_row, row_size); 571 | } 572 | } 573 | 574 | total_size += ggml_nbytes(tensor)/n_parts; 575 | } 576 | 577 | #if DEBUG 578 | //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); 579 | if (++n_tensors % 8 == 0) { 580 | fprintf(stderr, "."); 581 | fflush(stderr); 582 | } 583 | #endif // DEBUG 584 | } 585 | 586 | #if DEBUG 587 | fprintf(stderr, " done\n"); 588 | 589 | fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); 590 | #endif // DEBUG 591 | } 592 | 593 | //fin_close(); //can't unmap here 594 | } 595 | 596 | return true; 597 | } 598 | 599 | // load the model's weights from a file 600 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) { 601 | #if DEBUG 602 | fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str()); 603 | #endif // DEBUG 604 | 605 | std::vector f_buf(1024*1024); 606 | 607 | auto fin = std::ifstream(fname, std::ios::binary); 608 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 609 | if (!fin) { 610 | #if DEBUG 611 | fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); 612 | #endif // DEBUG 613 | return false; 614 | } 615 | 616 | // verify magic 617 | { 618 | uint32_t magic; 619 | fin.read((char *) &magic, sizeof(magic)); 620 | if (magic != 0x67676d6c) { 621 | #if DEBUG 622 | fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); 623 | #endif // DEBUG 624 | return false; 625 | } 626 | } 627 | 628 | int n_ff = 0; 629 | int n_parts = 0; 630 | 631 | // load hparams 632 | { 633 | auto & hparams = model.hparams; 634 | 635 | fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); 636 | //fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); 637 | fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); 638 | fin.read((char *) &hparams.n_mult, sizeof(hparams.n_mult)); 639 | fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); 640 | fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); 641 | fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot)); 642 | fin.read((char *) &hparams.f16, sizeof(hparams.f16)); 643 | 644 | hparams.n_ctx = n_ctx; 645 | 646 | n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; 647 | n_parts = LLAMA_N_PARTS.at(hparams.n_embd); 648 | 649 | // fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); 650 | // fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx); 651 | // fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd); 652 | // fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult); 653 | // fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head); 654 | // fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer); 655 | // fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot); 656 | // fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); 657 | // fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff); 658 | // fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts); 659 | } 660 | 661 | // load vocab 662 | { 663 | const int32_t n_vocab = model.hparams.n_vocab; 664 | 665 | if (n_vocab != model.hparams.n_vocab) { 666 | #if DEBUG 667 | fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", 668 | __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); 669 | #endif // DEBUG 670 | return false; 671 | } 672 | 673 | std::string word; 674 | for (int i = 0; i < n_vocab; i++) { 675 | uint32_t len; 676 | fin.read((char *) &len, sizeof(len)); 677 | 678 | word.resize(len); 679 | fin.read((char *) word.data(), len); 680 | 681 | vocab.token_to_id[word] = 
i; 682 | vocab.id_to_token[i] = word; 683 | 684 | //if (i < 30000) { 685 | // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str()); 686 | //} 687 | } 688 | } 689 | 690 | // for the big tensors, we have the option to store the data in 16-bit floats or quantized 691 | // in order to save memory and also to speed up the computation 692 | ggml_type wtype = GGML_TYPE_COUNT; 693 | switch (model.hparams.f16) { 694 | case 0: wtype = GGML_TYPE_F32; break; 695 | case 1: wtype = GGML_TYPE_F16; break; 696 | case 2: wtype = GGML_TYPE_Q4_0; break; 697 | case 3: wtype = GGML_TYPE_Q4_1; break; 698 | default: 699 | { 700 | #if DEBUG 701 | fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", 702 | __func__, fname.c_str(), model.hparams.f16); 703 | #endif // DEBUG 704 | return false; 705 | } 706 | } 707 | 708 | const ggml_type wtype2 = GGML_TYPE_F32; 709 | 710 | auto & ctx = model.ctx; 711 | 712 | size_t ctx_size = 0; 713 | 714 | { 715 | const auto & hparams = model.hparams; 716 | 717 | const int n_embd = hparams.n_embd; 718 | const int n_layer = hparams.n_layer; 719 | const int n_ctx = hparams.n_ctx; 720 | const int n_vocab = hparams.n_vocab; 721 | 722 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings 723 | 724 | ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm 725 | 726 | ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output 727 | 728 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm 729 | 730 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq 731 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk 732 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv 733 | ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo 734 | 735 | ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm 736 | 737 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1 738 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2 739 | ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3 740 | 741 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k 742 | ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v 743 | 744 | ctx_size += (5 + 10*n_layer)*256; // object overhead 745 | 746 | #if DEBUG 747 | fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0)); 748 | #endif // DEBUG 749 | } 750 | 751 | // create the ggml context 752 | { 753 | struct ggml_init_params params = { 754 | /*.mem_size =*/ ctx_size, 755 | /*.mem_buffer =*/ NULL, 756 | }; 757 | 758 | model.ctx = ggml_init(params); 759 | if (!model.ctx) { 760 | #if DEBUG 761 | fprintf(stderr, "%s: ggml_init() failed\n", __func__); 762 | #endif // DEBUG 763 | return false; 764 | } 765 | } 766 | 767 | // prepare memory for the weights 768 | { 769 | const auto & hparams = model.hparams; 770 | 771 | const int n_embd = hparams.n_embd; 772 | const int n_layer = hparams.n_layer; 773 | const int n_ctx = hparams.n_ctx; 774 | const int n_vocab = hparams.n_vocab; 775 | 776 | model.layers.resize(n_layer); 777 | 778 | model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 779 | 780 | model.norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 781 | model.output = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); 782 | 783 | // map by name 784 | model.tensors["tok_embeddings.weight"] = model.tok_embeddings; 785 | 786 | model.tensors["norm.weight"] = model.norm; 787 | model.tensors["output.weight"] 
= model.output; 788 | 789 | for (int i = 0; i < n_layer; ++i) { 790 | auto & layer = model.layers[i]; 791 | 792 | layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 793 | 794 | layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 795 | layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 796 | layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 797 | layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); 798 | 799 | layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); 800 | 801 | layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 802 | layer.w2 = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd); 803 | layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff); 804 | 805 | // map by name 806 | model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm; 807 | 808 | model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq; 809 | model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk; 810 | model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv; 811 | model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo; 812 | 813 | model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm; 814 | 815 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1; 816 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2; 817 | model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3; 818 | } 819 | } 820 | 821 | // key + value memory 822 | { 823 | const auto & hparams = model.hparams; 824 | 825 | const int n_embd = hparams.n_embd; 826 | const int n_layer = hparams.n_layer; 827 | const int n_ctx = hparams.n_ctx; 828 | 829 | const int n_mem = n_layer*n_ctx; 830 | const int n_elements = n_embd*n_mem; 831 | 832 | model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 833 | model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); 834 | 835 | const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); 836 | 837 | #if DEBUG 838 | fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); 839 | #endif // DEBUG 840 | } 841 | 842 | const size_t file_offset = fin.tellg(); 843 | 844 | fin.close(); 845 | 846 | std::vector tmp; 847 | 848 | for (int i = 0; i < n_parts; ++i) { 849 | const int part_id = i; 850 | //const int part_id = n_parts - i - 1; 851 | 852 | std::string fname_part = fname; 853 | if (i > 0) { 854 | fname_part += "." 
+ std::to_string(i); 855 | } 856 | 857 | #if DEBUG 858 | fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str()); 859 | #endif // DEBUG 860 | 861 | fin = std::ifstream(fname_part, std::ios::binary); 862 | fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size()); 863 | fin.seekg(file_offset); 864 | 865 | // load weights 866 | { 867 | int n_tensors = 0; 868 | size_t total_size = 0; 869 | 870 | #if DEBUG 871 | fprintf(stderr, "%s: ", __func__); 872 | #endif // DEBUG 873 | 874 | while (true) { 875 | int32_t n_dims; 876 | int32_t length; 877 | int32_t ftype; 878 | 879 | fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); 880 | fin.read(reinterpret_cast(&length), sizeof(length)); 881 | fin.read(reinterpret_cast(&ftype), sizeof(ftype)); 882 | 883 | if (fin.eof()) { 884 | break; 885 | } 886 | 887 | int32_t nelements = 1; 888 | int32_t ne[2] = { 1, 1 }; 889 | for (int i = 0; i < n_dims; ++i) { 890 | fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); 891 | nelements *= ne[i]; 892 | } 893 | 894 | std::string name(length, 0); 895 | fin.read(&name[0], length); 896 | 897 | if (model.tensors.find(name.data()) == model.tensors.end()) { 898 | #if DEBUG 899 | fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); 900 | #endif // DEBUG 901 | return false; 902 | } 903 | 904 | // split_type = 0: split by columns 905 | // split_type = 1: split by rows 906 | int split_type = 0; 907 | 908 | // split_type = 0: 909 | // regex: 910 | // - tok_embeddings.* 911 | // - layers.*.attention.wo.weight 912 | // - layers.*.feed_forward.w2.weight 913 | 914 | // split_type = 1: 915 | // regex: 916 | // - output.* 917 | // - layers.*.attention.wq.weight 918 | // - layers.*.attention.wk.weight 919 | // - layers.*.attention.wv.weight 920 | // - layers.*.feed_forward.w1.weight 921 | // - layers.*.feed_forward.w3.weight 922 | if (name.find("tok_embeddings") != std::string::npos) { 923 | split_type = 0; 924 | } else if (name.find("layers") != std::string::npos) { 925 | if (name.find("attention.wo.weight") != std::string::npos) { 926 | split_type = 0; 927 | } else if (name.find("feed_forward.w2.weight") != std::string::npos) { 928 | split_type = 0; 929 | } else { 930 | split_type = 1; 931 | } 932 | } else if (name.find("output") != std::string::npos) { 933 | split_type = 1; 934 | } 935 | 936 | auto tensor = model.tensors[name.data()]; 937 | 938 | if (n_dims == 1) { 939 | if (ggml_nelements(tensor) != nelements) { 940 | #if DEBUG 941 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 942 | #endif // DEBUG 943 | return false; 944 | } 945 | } else { 946 | if (ggml_nelements(tensor)/n_parts != nelements) { 947 | #if DEBUG 948 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); 949 | #endif // DEBUG 950 | return false; 951 | } 952 | } 953 | 954 | if (n_dims == 1) { 955 | if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { 956 | #if DEBUG 957 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 958 | __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); 959 | #endif // DEBUG 960 | return false; 961 | } 962 | } else { 963 | if (split_type == 0) { 964 | if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) { 965 | #if DEBUG 966 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 967 | __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]); 968 | 
#endif // DEBUG 969 | return false; 970 | } 971 | } else { 972 | if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) { 973 | #if DEBUG 974 | fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", 975 | __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]); 976 | #endif // DEBUG 977 | return false; 978 | } 979 | } 980 | } 981 | 982 | #if DEBUG 983 | if (0) { 984 | static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; 985 | fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type); 986 | } 987 | #endif // DEBUG 988 | 989 | size_t bpe = 0; 990 | 991 | switch (ftype) { 992 | case 0: bpe = ggml_type_size(GGML_TYPE_F32); break; 993 | case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; 994 | case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; 995 | case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; 996 | default: 997 | { 998 | #if DEBUG 999 | fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); 1000 | #endif // DEBUG 1001 | return false; 1002 | } 1003 | }; 1004 | 1005 | if (n_dims == 1 || n_parts == 1) { 1006 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { 1007 | #if DEBUG 1008 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 1009 | __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); 1010 | #endif // DEBUG 1011 | return false; 1012 | } 1013 | 1014 | if (part_id == 0) { 1015 | fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); 1016 | } else { 1017 | fin.seekg(ggml_nbytes(tensor), std::ios::cur); 1018 | } 1019 | 1020 | total_size += ggml_nbytes(tensor); 1021 | } else { 1022 | if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) { 1023 | #if DEBUG 1024 | fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", 1025 | __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe); 1026 | #endif // DEBUG 1027 | return false; 1028 | } 1029 | 1030 | if (split_type == 0) { 1031 | const int np0 = ne[0]; 1032 | 1033 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 1034 | assert(row_size == tensor->nb[1]); 1035 | 1036 | for (int i1 = 0; i1 < ne[1]; ++i1) { 1037 | const size_t offset_row = i1*row_size; 1038 | const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 1039 | fin.read(reinterpret_cast(tensor->data) + offset, row_size/n_parts); 1040 | } 1041 | } else { 1042 | const int np1 = ne[1]; 1043 | 1044 | const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type); 1045 | 1046 | for (int i1 = 0; i1 < ne[1]; ++i1) { 1047 | const size_t offset_row = (i1 + part_id*np1)*row_size; 1048 | fin.read(reinterpret_cast(tensor->data) + offset_row, row_size); 1049 | } 1050 | } 1051 | 1052 | total_size += ggml_nbytes(tensor)/n_parts; 1053 | } 1054 | 1055 | #if DEBUG 1056 | //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? 
"float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); 1057 | if (++n_tensors % 8 == 0) { 1058 | fprintf(stderr, "."); 1059 | fflush(stderr); 1060 | } 1061 | #endif // DEBUG 1062 | } 1063 | 1064 | #if DEBUG 1065 | fprintf(stderr, " done\n"); 1066 | 1067 | fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors); 1068 | #endif // DEBUG 1069 | } 1070 | 1071 | fin.close(); 1072 | } 1073 | 1074 | return true; 1075 | } 1076 | 1077 | // evaluate the transformer 1078 | // 1079 | // - model: the model 1080 | // - n_threads: number of threads to use 1081 | // - n_past: the context size so far 1082 | // - embd_inp: the embeddings of the tokens in the context 1083 | // - embd_w: the predicted logits for the next token 1084 | // 1085 | // The GPT-J model requires about 16MB of memory per input token. 1086 | // 1087 | bool llama_eval( 1088 | const llama_model & model, 1089 | const int n_threads, 1090 | const int n_past, 1091 | const std::vector & embd_inp, 1092 | std::vector & embd_w, 1093 | size_t & mem_per_token) { 1094 | const int N = (int)embd_inp.size(); 1095 | 1096 | const auto & hparams = model.hparams; 1097 | 1098 | const int n_embd = hparams.n_embd; 1099 | const int n_layer = hparams.n_layer; 1100 | const int n_ctx = hparams.n_ctx; 1101 | const int n_head = hparams.n_head; 1102 | const int n_vocab = hparams.n_vocab; 1103 | const int n_rot = hparams.n_embd/hparams.n_head; 1104 | 1105 | const int d_key = n_embd/n_head; 1106 | 1107 | // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case 1108 | // static size_t buf_size = hparams.n_ctx*1024*1024; 1109 | static size_t buf_size = 512u*1024*1024; 1110 | static void * buf = malloc(buf_size); 1111 | 1112 | if (mem_per_token > 0 && mem_per_token*N > buf_size) { 1113 | const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead 1114 | //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); 1115 | 1116 | // reallocate 1117 | buf_size = buf_size_new; 1118 | buf = realloc(buf, buf_size); 1119 | if (buf == nullptr) { 1120 | #if DEBUG 1121 | fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size); 1122 | #endif // DEBUG 1123 | return false; 1124 | } 1125 | } 1126 | 1127 | struct ggml_init_params params = { 1128 | /*.mem_size =*/ buf_size, 1129 | /*.mem_buffer =*/ buf, 1130 | }; 1131 | 1132 | struct ggml_context * ctx0 = ggml_init(params); 1133 | ggml_cgraph gf = {}; 1134 | gf.n_threads = n_threads; 1135 | 1136 | struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); 1137 | memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); 1138 | 1139 | struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd); 1140 | 1141 | for (int il = 0; il < n_layer; ++il) { 1142 | struct ggml_tensor * inpSA = inpL; 1143 | 1144 | struct ggml_tensor * cur; 1145 | 1146 | // norm 1147 | { 1148 | cur = ggml_rms_norm(ctx0, inpL); 1149 | 1150 | // cur = attention_norm*cur 1151 | cur = ggml_mul(ctx0, 1152 | ggml_repeat(ctx0, model.layers[il].attention_norm, cur), 1153 | cur); 1154 | } 1155 | 1156 | // self-attention 1157 | { 1158 | struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); 1159 | struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); 1160 | struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); 1161 | 1162 | // store key and value to memory 1163 | if (N >= 1) { 1164 
| struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); 1165 | struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); 1166 | 1167 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k)); 1168 | ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v)); 1169 | } 1170 | 1171 | // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) 1172 | struct ggml_tensor * Q = 1173 | ggml_permute(ctx0, 1174 | ggml_rope(ctx0, 1175 | ggml_cpy(ctx0, 1176 | Qcur, 1177 | ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 1178 | n_past, n_rot, 0), 1179 | 0, 2, 1, 3); 1180 | 1181 | // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) 1182 | struct ggml_tensor * K = 1183 | ggml_permute(ctx0, 1184 | ggml_rope(ctx0, 1185 | ggml_reshape_3d(ctx0, 1186 | ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), 1187 | n_embd/n_head, n_head, n_past + N), 1188 | n_past, n_rot, 1), 1189 | 0, 2, 1, 3); 1190 | 1191 | // K * Q 1192 | struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); 1193 | 1194 | // KQ_scaled = KQ / sqrt(n_embd/n_head) 1195 | struct ggml_tensor * KQ_scaled = 1196 | ggml_scale(ctx0, 1197 | KQ, 1198 | ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head)) 1199 | ); 1200 | 1201 | // KQ_masked = mask_past(KQ_scaled) 1202 | struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); 1203 | 1204 | // KQ = soft_max(KQ_masked) 1205 | struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); 1206 | 1207 | // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() 1208 | struct ggml_tensor * V_trans = 1209 | ggml_permute(ctx0, 1210 | ggml_reshape_3d(ctx0, 1211 | ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), 1212 | n_embd/n_head, n_head, n_past + N), 1213 | 1, 2, 0, 3); 1214 | 1215 | // KQV = transpose(V) * KQ_soft_max 1216 | struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); 1217 | 1218 | // KQV_merged = KQV.permute(0, 2, 1, 3) 1219 | struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); 1220 | 1221 | // cur = KQV_merged.contiguous().view(n_embd, N) 1222 | cur = ggml_cpy(ctx0, 1223 | KQV_merged, 1224 | ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); 1225 | 1226 | // projection (no bias) 1227 | cur = ggml_mul_mat(ctx0, 1228 | model.layers[il].wo, 1229 | cur); 1230 | } 1231 | 1232 | struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); 1233 | 1234 | // feed-forward network 1235 | { 1236 | // norm 1237 | { 1238 | cur = ggml_rms_norm(ctx0, inpFF); 1239 | 1240 | // cur = ffn_norm*cur 1241 | cur = ggml_mul(ctx0, 1242 | ggml_repeat(ctx0, model.layers[il].ffn_norm, cur), 1243 | cur); 1244 | } 1245 | 1246 | struct ggml_tensor * tmp = ggml_mul_mat(ctx0, 1247 | model.layers[il].w3, 1248 | cur); 1249 | 1250 | 1251 | cur = ggml_mul_mat(ctx0, 1252 | model.layers[il].w1, 1253 | cur); 1254 | 1255 | // SILU activation 1256 | cur = ggml_silu(ctx0, cur); 1257 | 1258 | cur = ggml_mul(ctx0, cur, tmp); 1259 | 1260 | cur = ggml_mul_mat(ctx0, 1261 | model.layers[il].w2, 1262 | cur); 1263 | } 1264 | 1265 | cur = ggml_add(ctx0, cur, inpFF); 1266 | 1267 | // input for next layer 1268 | inpL = cur; 1269 | } 1270 | 1271 | // norm 1272 | { 1273 | inpL = ggml_rms_norm(ctx0, inpL); 1274 | 1275 | // inpL = norm*inpL 1276 | inpL 
= ggml_mul(ctx0, 1277 | ggml_repeat(ctx0, model.norm, inpL), 1278 | inpL); 1279 | } 1280 | 1281 | // lm_head 1282 | { 1283 | inpL = ggml_mul_mat(ctx0, model.output, inpL); 1284 | } 1285 | 1286 | // logits -> probs 1287 | //inpL = ggml_soft_max(ctx0, inpL); 1288 | 1289 | // run the computation 1290 | ggml_build_forward_expand(&gf, inpL); 1291 | ggml_graph_compute (ctx0, &gf); 1292 | 1293 | //if (n_past%100 == 0) { 1294 | // ggml_graph_print (&gf); 1295 | // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); 1296 | //} 1297 | 1298 | //embd_w.resize(n_vocab*N); 1299 | //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); 1300 | 1301 | // return result for just the last token 1302 | embd_w.resize(n_vocab); 1303 | memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); 1304 | 1305 | if (mem_per_token == 0) { 1306 | mem_per_token = ggml_used_mem(ctx0)/N; 1307 | } 1308 | //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0)); 1309 | 1310 | ggml_free(ctx0); 1311 | 1312 | return true; 1313 | } 1314 | --------------------------------------------------------------------------------
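Note on the low-memory loader: `llama_model_load_lowmem` above reads the model through a set of `fin_*` helpers over a memory-mapped buffer (`mbuf_t`) instead of an `std::ifstream`; only `fin_close` is visible in this excerpt. The following is a minimal sketch, assuming the helpers wrap POSIX mmap with a moving read offset, of how such a reader could look. The names mirror the calls above, but the bodies are illustrative rather than the project's actual implementation; in particular, the real `fin_seekg` is also used to skip over a tensor, so its argument may be interpreted differently at some call sites.

// Sketch only: one possible mmap-backed reader matching the fin_* calls used by
// llama_model_load_lowmem above. Assumes POSIX <sys/mman.h>; the real chat.cpp
// helpers may differ.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cstring>

struct mbuf_t {
    char   *buf  = nullptr;  // start of the mapping
    size_t  size = 0;        // mapped length in bytes
    char   *p    = nullptr;  // current read pointer
    size_t  oft  = 0;        // current offset from buf
};

static int fin_init(mbuf_t &mbuf, const char *fname) {
    int fd = open(fname, O_RDONLY);
    if (fd < 0) return 1;
    struct stat st;
    if (fstat(fd, &st) != 0) { close(fd); return 1; }
    void *buf = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);  // the mapping remains valid after the descriptor is closed
    if (buf == MAP_FAILED) return 1;
    mbuf.buf  = (char *) buf;
    mbuf.size = (size_t) st.st_size;
    mbuf.p    = mbuf.buf;
    mbuf.oft  = 0;
    return 0;  // 0 == success, matching "if(res) return false;" above
}

static void fin_read(mbuf_t &mbuf, char *dst, size_t len) {
    // copy bytes out of the mapping (a robust version would clamp len at the
    // end of the mapping before the fin_eof check in the loader)
    memcpy(dst, mbuf.p, len);
    mbuf.p   += len;
    mbuf.oft += len;
}

// The low-memory trick: rather than copying tensor data, hand the caller a
// pointer straight into the mapped file so the kernel pages weights in lazily.
static void fin_read_dummy(mbuf_t &mbuf, char **dst, size_t len) {
    *dst = mbuf.p;
    mbuf.p   += len;
    mbuf.oft += len;
}

static bool fin_eof(mbuf_t &mbuf) {
    return mbuf.oft >= mbuf.size;
}

static size_t fin_tellg(mbuf_t &mbuf) {
    return mbuf.oft;
}

static void fin_seekg(mbuf_t &mbuf, size_t oft) {
    mbuf.oft = oft;               // absolute seek in this sketch
    mbuf.p   = mbuf.buf + mbuf.oft;
}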
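For reference, the `n_ff` expression used by both loaders rounds 8/3 of `n_embd` up to a multiple of `n_mult`. A small worked example, with 4096 and 256 as assumed example values rather than numbers read from this repository:

// Worked example of the n_ff rounding used in the loaders above.
// n_embd = 4096 and n_mult = 256 are assumed 7B-style example values.
int n_embd = 4096;
int n_mult = 256;
int n_ff = ((2*(4*n_embd)/3 + n_mult - 1)/n_mult)*n_mult;  // (10922 + 255)/256 = 43; 43*256 = 11008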
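To tie the pieces together, here is a hypothetical call sequence showing how the loaders and `llama_eval` above might be driven. The includes, the model path, the thread and context counts, and the use of `llama_tokenize` from utils.h are assumptions for illustration, not code from this repository.

// Hypothetical usage sketch (not part of the repository).
#include <cstdio>
#include <string>
#include <vector>

#include "chat.h"   // assumed to declare llama_model, llama_model_load_lowmem, llama_eval
#include "utils.h"  // assumed to declare gpt_vocab and llama_tokenize

int main() {
    llama_model model;
    gpt_vocab vocab;
    const std::string fname = "ggml-alpaca-7b-q4.bin";  // assumed model path

    // The mmap-based loader keeps resident memory low by pointing tensors into the file.
    if (!llama_model_load_lowmem(fname, model, vocab, /*n_ctx=*/512)) {
        fprintf(stderr, "failed to load %s\n", fname.c_str());
        return 1;
    }

    std::vector<gpt_vocab::id> embd_inp = llama_tokenize(vocab, "Hello", /*bos=*/true);
    std::vector<float> logits;
    size_t mem_per_token = 0;  // the first llama_eval call measures per-token memory use

    if (!llama_eval(model, /*n_threads=*/4, /*n_past=*/0, embd_inp, logits, mem_per_token)) {
        fprintf(stderr, "evaluation failed\n");
        return 1;
    }

    // logits now holds n_vocab values for the last input token.
    llma_model_unload(model);  // unload/unmap, using the name as it appears above
    return 0;
}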