├── Applications
    ├── AlpacaChatApp
    │   ├── Resources
    │   │   ├── .gitignore
    │   │   └── Assets.xcassets
    │   │   │   ├── Contents.json
    │   │   │   ├── AppIcon.appiconset
    │   │   │       ├── Icon.png
    │   │   │       └── Contents.json
    │   │   │   └── AccentColor.colorset
    │   │   │       └── Contents.json
    │   ├── Configurations
    │   │   ├── .gitignore
    │   │   └── AlpacaChatApp.xcconfig
    │   ├── Preview Content
    │   │   └── Preview Assets.xcassets
    │   │   │   └── Contents.json
    │   ├── Sources
    │   │   ├── String.swift
    │   │   ├── AlpacaChatApp.swift
    │   │   ├── Message.swift
    │   │   ├── ChatView.swift
    │   │   ├── MessageView.swift
    │   │   └── ChatViewModel.swift
    │   └── Supporting Files
    │   │   └── AlpacaChatApp.entitlements
    ├── AlpacaChatCLI
    │   ├── .gitignore
    │   ├── Package.resolved
    │   ├── Package.swift
    │   └── Sources
    │   │   └── AlpacaChatCLI
    │   │       └── Command.swift
    └── AlpacaChatApp.xcodeproj
    │   ├── project.xcworkspace
    │       ├── contents.xcworkspacedata
    │       └── xcshareddata
    │       │   └── IDEWorkspaceChecks.plist
    │   ├── xcshareddata
    │       └── xcschemes
    │       │   └── AlpacaChatApp.xcscheme
    │   └── project.pbxproj
├── Resources
    └── AlpacaChat.png
├── .gitignore
├── Sources
    ├── alpaca.cpp
    │   ├── README.md
    │   ├── LICENSE
    │   ├── include
    │   │   ├── chat.h
    │   │   ├── utils.h
    │   │   └── ggml.h
    │   ├── utils.cpp
    │   └── chat.cpp
    ├── AlpacaChat
    │   ├── Model.swift
    │   └── Chat.swift
    └── AlpacaChatObjC
    │   ├── include
    │       ├── ALPChatModel.h
    │       └── ALPChat.h
    │   └── ALPChat.mm
├── Package.swift
├── LICENSE
└── README.md


/Applications/AlpacaChatApp/Resources/.gitignore:
--------------------------------------------------------------------------------
1 | *.bin
2 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Configurations/.gitignore:
--------------------------------------------------------------------------------
1 | Local.xcconfig
2 | 


--------------------------------------------------------------------------------
/Resources/AlpacaChat.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niw/AlpacaChat/HEAD/Resources/AlpacaChat.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .swiftpm/config/registries.json
3 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
4 | xcuserdata/
5 | /.build
6 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Resources/Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 |   "info" : {
3 |     "author" : "xcode",
4 |     "version" : 1
5 |   }
6 | }
7 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Preview Content/Preview Assets.xcassets/Contents.json:
--------------------------------------------------------------------------------
1 | {
2 |   "info" : {
3 |     "author" : "xcode",
4 |     "version" : 1
5 |   }
6 | }
7 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatCLI/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .swiftpm/config/registries.json
3 | .swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
4 | xcuserdata/
5 | /.build
6 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/README.md:
--------------------------------------------------------------------------------
1 | Alpaca.cpp
2 | ==========
3 | 
4 | See <https://github.com/antimatter15/alpaca.cpp> and
5 | original <https://github.com/ggerganov/llama.cpp> for details.
6 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/niw/AlpacaChat/HEAD/Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Icon.png


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/String.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  String.swift
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/26/23.
 6 | //
 7 | 
 8 | import Foundation
 9 | 
10 | extension String: Error {
11 | }
12 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp.xcodeproj/project.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <Workspace
3 |    version = "1.0">
4 |    <FileRef
5 |       location = "self:">
6 |    </FileRef>
7 | </Workspace>
8 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | 	<key>IDEDidComputeMac32BitWarning</key>
6 | 	<true/>
7 | </dict>
8 | </plist>
9 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Resources/Assets.xcassets/AppIcon.appiconset/Contents.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "images" : [
 3 |     {
 4 |       "filename" : "Icon.png",
 5 |       "idiom" : "universal",
 6 |       "platform" : "ios",
 7 |       "size" : "1024x1024"
 8 |     }
 9 |   ],
10 |   "info" : {
11 |     "author" : "xcode",
12 |     "version" : 1
13 |   }
14 | }
15 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/AlpacaChatApp.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  AlpacaChatApp.swift
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | import SwiftUI
 9 | 
10 | @main
11 | struct AlpacaChatApp: App {
12 |     var body: some Scene {
13 |         WindowGroup {
14 |             ChatView()
15 |         }
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Resources/Assets.xcassets/AccentColor.colorset/Contents.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "colors" : [
 3 |     {
 4 |       "color" : {
 5 |         "platform" : "ios",
 6 |         "reference" : "systemOrangeColor"
 7 |       },
 8 |       "idiom" : "universal"
 9 |     }
10 |   ],
11 |   "info" : {
12 |     "author" : "xcode",
13 |     "version" : 1
14 |   }
15 | }
16 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatCLI/Package.resolved:
--------------------------------------------------------------------------------
 1 | {
 2 |   "pins" : [
 3 |     {
 4 |       "identity" : "swift-argument-parser",
 5 |       "kind" : "remoteSourceControl",
 6 |       "location" : "https://github.com/apple/swift-argument-parser",
 7 |       "state" : {
 8 |         "revision" : "fee6933f37fde9a5e12a1e4aeaa93fe60116ff2a",
 9 |         "version" : "1.2.2"
10 |       }
11 |     }
12 |   ],
13 |   "version" : 2
14 | }
15 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 3 | <plist version="1.0">
 4 | <dict>
 5 | 	<key>com.apple.developer.kernel.extended-virtual-addressing</key>
 6 | 	<true/>
 7 | 	<key>com.apple.developer.kernel.increased-memory-limit</key>
 8 | 	<true/>
 9 | </dict>
10 | </plist>
11 | 


--------------------------------------------------------------------------------
/Sources/AlpacaChat/Model.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Model.swift
 3 | //  AlpacaChat
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | import Foundation
 9 | import AlpacaChatObjC
10 | 
11 | public struct Model {
12 |     var model: ALPChatModel
13 | 
14 |     @available(macOS 10.15, iOS 13.0, watchOS 6.0, tvOS 13.0, *)
15 |     public static func load(from url: URL, contextSize: Int32 = 512, isLowMemory: Bool = false) async throws -> Self {
16 |         let model = try ALPChatModel.load(from: url, contextSize: contextSize, isLowMemory: isLowMemory)
17 |         return Model(model: model)
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/Message.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Message.swift
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/20/23.
 6 | //
 7 | 
 8 | import Foundation
 9 | 
10 | struct Message: Identifiable {
11 |     enum State {
12 |         case none
13 |         case error
14 |         case typed
15 |         case predicting
16 |         case predicted(tokensPerSeconds: Double)
17 |     }
18 | 
19 |     enum Sender {
20 |         case user
21 |         case system
22 |     }
23 | 
24 |     var id = UUID()
25 |     var sender: Sender
26 |     var state: State = .none
27 |     var text: String
28 | }
29 | 


--------------------------------------------------------------------------------
/Sources/AlpacaChat/Chat.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Chat.swift
 3 | //  AlpacaChat
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | import Foundation
 9 | import AlpacaChatObjC
10 | 
11 | public final class Chat {
12 |     private let chat: ALPChat
13 | 
14 |     public init(model: Model) {
15 |         chat = ALPChat(model: model.model)
16 |     }
17 | 
18 |     @available(macOS 10.15, iOS 13.0, watchOS 6.0, tvOS 13.0, *)
19 |     public func predictTokens(for prompt: String) -> AsyncThrowingStream<String, Error> {
20 |         AsyncThrowingStream { continuation in
21 |             chat.predictTokens(for: prompt) { token in
22 |                 continuation.yield(token)
23 |             } completionHandler: { error in
24 |                 continuation.finish(throwing: error)
25 |             }
26 |         }
27 |     }
28 | }
29 | 


--------------------------------------------------------------------------------
/Sources/AlpacaChatObjC/include/ALPChatModel.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  ALPChatModel.h
 3 | //  AlpacaChatObjC
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/16/23.
 6 | //
 7 | 
 8 | #import <Foundation/Foundation.h>
 9 | 
10 | NS_ASSUME_NONNULL_BEGIN
11 | 
12 | FOUNDATION_EXPORT NSString * const ALPChatModelErrorDomain;
13 | 
14 | NS_ENUM(NSUInteger, ALPChatModelErrorCode) {
15 |     ALPChatModelErrorCodeUnknown = 0,
16 |     ALPChatModelErrorCodeFailedToLoad
17 | };
18 | 
19 | @interface ALPChatModel : NSObject
20 | 
21 | + (nullable ALPChatModel *)loadFromURL:(NSURL *)URL
22 |                            contextSize:(int)contextSize
23 |                            isLowMemory:(BOOL)isLowMemory
24 |                                  error:(NSError * _Nullable * _Nullable)error;
25 | 
26 | - (instancetype)init NS_UNAVAILABLE;
27 | + (instancetype)new NS_UNAVAILABLE;
28 | 
29 | @end
30 | 
31 | NS_ASSUME_NONNULL_END
32 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Configurations/AlpacaChatApp.xcconfig:
--------------------------------------------------------------------------------
 1 | //
 2 | //  AlpacaChatApp.xcconfig
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | // Place `Local.xcconfig` file which is git ignored in this `Configurations` directory
 9 | // can supply local specific configurations such as code signing identity.
10 | // For example, put next content as `Local.xcconfig`.
11 | // ```
12 | // CODE_SIGN_STYLE = Manual
13 | // CODE_SIGN_IDENTITY = iPhone Developer
14 | // DEVELOPMENT_TEAM = $(YOUR_DEVELOPMENT_TEAM_ID)
15 | // PRODUCT_BUNDLE_IDENTIFIER = $(YOUR_APP_BUNDLE_IDENTIFIER)
16 | // PROVISIONING_PROFILE_SPECIFIER = $(YOUR_PROFILE_NAME)
17 | // ```
18 | 
19 | // Default values, you can override it in `Local.xcconfig`.
20 | CODE_SIGN_STYLE = Manual
21 | CODE_SIGN_IDENTITY = iPhone Developer
22 | PRODUCT_BUNDLE_IDENTIFIER = at.niw.AlpacaChatApp
23 | 
24 | #include? "Local.xcconfig"
25 | 


--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
 1 | // swift-tools-version: 5.7
 2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
 3 | 
 4 | import PackageDescription
 5 | 
 6 | let package = Package(
 7 |     name: "AlpacaChat",
 8 |     products: [
 9 |         .library(
10 |             name: "AlpacaChat",
11 |             targets: [
12 |                 "AlpacaChat",
13 |                 "AlpacaChatObjC"
14 |             ]
15 |         )
16 |     ],
17 |     targets: [
18 |         .target(
19 |             name: "AlpacaChat",
20 |             dependencies: [
21 |                 .target(name: "AlpacaChatObjC")
22 |             ]
23 |         ),
24 |         .target(
25 |             name: "AlpacaChatObjC",
26 |             dependencies: [
27 |                 .target(name: "alpaca.cpp")
28 |             ]
29 |         ),
30 |         .target(
31 |             name: "alpaca.cpp"
32 |         )
33 |     ],
34 |     cxxLanguageStandard: .cxx11
35 | )
36 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatCLI/Package.swift:
--------------------------------------------------------------------------------
 1 | // swift-tools-version: 5.7
 2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
 3 | 
 4 | import PackageDescription
 5 | 
 6 | let package = Package(
 7 |     name: "AlpacaChatCLI",
 8 |     platforms: [
 9 |         .macOS(.v10_15)
10 |     ],
11 |     products: [
12 |         .executable(
13 |             name: "AlpacaChatCLI",
14 |             targets: [
15 |                 "AlpacaChatCLI"
16 |             ]
17 |         )
18 |     ],
19 |     dependencies: [
20 |         .package(name: "AlpacaChat", path: "../.."),
21 |         .package(url: "https://github.com/apple/swift-argument-parser", .upToNextMajor(from: "1.2.2"))
22 |     ],
23 |     targets: [
24 |         .executableTarget(
25 |             name: "AlpacaChatCLI",
26 |             dependencies: [
27 |                 .product(name: "ArgumentParser", package: "swift-argument-parser"),
28 |                 .product(name: "AlpacaChat", package: "AlpacaChat")
29 |             ]
30 |         ),
31 |     ]
32 | )
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2023 Yoshimasa Niwa
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining
 4 | a copy of this software and associated documentation files (the
 5 | "Software"), to deal in the Software without restriction, including
 6 | without limitation the rights to use, copy, modify, merge, publish,
 7 | distribute, sublicense, and/or sell copies of the Software, and to
 8 | permit persons to whom the Software is furnished to do so, subject to
 9 | the following conditions:
10 | 
11 | The above copyright notice and this permission notice shall be
12 | included in all copies or substantial portions of the Software.
13 | 
14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/Sources/AlpacaChatObjC/include/ALPChat.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  ALPChat.h
 3 | //  AlpacaChatObjC
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/16/23.
 6 | //
 7 | 
 8 | #import <Foundation/Foundation.h>
 9 | 
10 | NS_ASSUME_NONNULL_BEGIN
11 | 
12 | @class ALPChatModel;
13 | 
14 | FOUNDATION_EXPORT NSString * const ALPChatErrorDomain;
15 | 
16 | NS_ENUM(NSUInteger, ALPChatErrorCode) {
17 |     ALPChatErrorCodeUnknown = 0,
18 |     ALPChatErrorCodeCancelled,
19 |     ALPChatErrorCodeFailedToPredict,
20 |     ALPChatErrorCodeNoRemainingTokens,
21 | };
22 | 
23 | @protocol ALPChatCancellable <NSObject>
24 | 
25 | - (void)cancel;
26 | 
27 | @end
28 | 
29 | @interface ALPChat : NSObject
30 | 
31 | - (instancetype)initWithModel:(ALPChatModel *)model NS_DESIGNATED_INITIALIZER;
32 | - (instancetype)init NS_UNAVAILABLE;
33 | + (instancetype)new NS_UNAVAILABLE;
34 | 
35 | - (id<ALPChatCancellable>)predictTokensForPrompt:(NSString *)prompt
36 |                                     tokenHandler:(nullable void (^)(NSString *token))tokenHandler
37 |                                completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler
38 | NS_SWIFT_NAME(predictTokens(for:tokenHandler:completionHandler:));
39 | 
40 | @end
41 | 
42 | NS_ASSUME_NONNULL_END
43 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatCLI/Sources/AlpacaChatCLI/Command.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  Command.swift
 3 | //  AlpacaChatCLI
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | import Foundation
 9 | import AlpacaChat
10 | import ArgumentParser
11 | import Darwin
12 | 
13 | @main
14 | struct Command: AsyncParsableCommand {
15 |     @Option(name: .shortAndLong, help: "Path to model file.")
16 |     var modelPath: String
17 |     @Option(name: .shortAndLong, help: "Context size.")
18 |     var contextSize: Int32 = 2048
19 |     @Flag(name: .shortAndLong, help: "Use low memory model loading.")
20 |     var lowMemory: Bool = false
21 | 
22 |     mutating func run() async throws {
23 |         let modelURL = URL(fileURLWithPath: modelPath)
24 |         let model = try await Model.load(from: modelURL, contextSize: contextSize, isLowMemory: lowMemory)
25 |         let chat = Chat(model: model)
26 | 
27 |         while true {
28 |             print("> ", terminator: "")
29 |             guard let prompt = readLine() else {
30 |                 break
31 |             }
32 |             guard !prompt.isEmpty else {
33 |                 continue
34 |             }
35 | 
36 |             for try await token in chat.predictTokens(for: prompt) {
37 |                 print(token, terminator: "")
38 |                 fflush(stdout)
39 |             }
40 |             print("")
41 |         }
42 |     }
43 | }
44 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Georgi Gerganov
 4 | https://github.com/ggerganov/llama.cpp
 5 | 
 6 | Copyright (c) 2023 Kevin Kwok
 7 | https://github.com/antimatter15/alpaca.cpp
 8 | 
 9 | Copyright (c) 2023 Caize Wu
10 | https://github.com/Zepan/llama.cpp/commit/03ba421c74109b5bff297b207a1b47f8cc6fc05e
11 | 
12 | Permission is hereby granted, free of charge, to any person obtaining a copy
13 | of this software and associated documentation files (the "Software"), to deal
14 | in the Software without restriction, including without limitation the rights
15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16 | copies of the Software, and to permit persons to whom the Software is
17 | furnished to do so, subject to the following conditions:
18 | 
19 | The above copyright notice and this permission notice shall be included in all
20 | copies or substantial portions of the Software.
21 | 
22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
28 | SOFTWARE.
29 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/ChatView.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  ChatView.swift
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/18/23.
 6 | //
 7 | 
 8 | import SwiftUI
 9 | 
10 | struct ChatView: View {
11 |     @StateObject
12 |     private var viewModel = ChatViewModel()
13 | 
14 |     @State
15 |     private var inputText: String = ""
16 | 
17 |     var body: some View {
18 |            VStack {
19 |                List {
20 |                    ForEach(viewModel.messages) { message in
21 |                        MessageView(message: message)
22 |                    }
23 |                    .listRowSeparator(.hidden)
24 |             }
25 |             HStack {
26 |                 switch viewModel.state {
27 |                 case .none, .loading:
28 |                     ProgressView {
29 |                         Text("Loading...")
30 |                     }
31 |                 case .completed:
32 |                     TextField("Type your message...", text: $inputText)
33 |                         .textFieldStyle(RoundedBorderTextFieldStyle())
34 |                     Button {
35 |                         Task {
36 |                             let text = inputText
37 |                             inputText = ""
38 |                             await viewModel.send(message: text)
39 |                         }
40 |                     } label: {
41 |                         Image(systemName: "arrow.up.circle.fill")
42 |                     }
43 |                     .padding(.horizontal, 6.0)
44 |                     .disabled(inputText.isEmpty)
45 |                 }
46 |             }
47 |             .padding(.all)
48 |         }
49 |         .navigationTitle("Chat")
50 |         .task {
51 |             await viewModel.prepare()
52 |         }
53 |     }
54 | }
55 | 
56 | struct ChatView_Previews: PreviewProvider {
57 |     static var previews: some View {
58 |         ChatView()
59 |     }
60 | }
61 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | AlpacaChat
 2 | ==========
 3 | 
 4 | A Swift library that runs Alpaca-LoRA prediction locally
 5 | to implement ChatGPT like app on Apple platform devices.
 6 | 
 7 | ![AlpacaChat](Resources/AlpacaChat.png)
 8 | 
 9 | It is basically a wrapper for [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp)
10 | that provides a simple Swift API for it.
11 | 
12 | ```swift
13 | import AlpacaChat
14 | 
15 | // Load model and instantiate a chat.
16 | let model = try await Model.load(from: URL(fileURLWithPath: "model.bin"))
17 | let chat = Chat(model: model)
18 | 
19 | // Ask users to get prompt.
20 | let prompt = readLine()!
21 | 
22 | // Run prediction and print tokens.
23 | for try await token in chat.predictTokens(for: prompt) {
24 |     print(token)
25 | }
26 | ```
27 | 
28 | 
29 | Model
30 | -----
31 | 
32 | Read [alpaca.cpp](https://github.com/antimatter15/alpaca.cpp),
33 | [alpaca-lora](https://github.com/tloen/alpaca-lora), and
34 | [llma.cpp](https://github.com/ggerganov/llama.cpp),
35 | then create 4-bits quantized `ggml` model bin file.
36 | 
37 | Place it in `/Applications/AlpacaChatApp/Resouces/model.bin` for example,
38 | and build app and run it.
39 | 
40 | 
41 | Usage
42 | -----
43 | 
44 | See actual command line and SwiftUI application for usages.
45 | 
46 | 
47 | Applications
48 | ------------
49 | 
50 | ### `/Applications/AlpacaChatCLI`
51 | 
52 | A command line chat app that can run on macOS.
53 | 
54 | To build, use Xcode or simply use `swift` command.
55 | 
56 | ```
57 | $ cd Applications/AlpacaChatCLI
58 | $ swift build -c release
59 | $ .build/release/AlpacaChatCLI -m /path/to/model.bin
60 | ```
61 | 
62 | ### `/Applications/AlpacaChatApp.xcodeproj`
63 | 
64 | A SwiftUI chat app that can run on iOS devices.
65 | 
66 | To build app runs on actual device, you need to create your own AppID
67 | and provisioning profile that allows extended memory usage with
68 | an entitlement.
69 | 
70 | Place `/Applications/AlpacaChatApp/Configurations/Local.xcconfig`
71 | to provide these your local development configurations for signing.
72 | 
73 | You may want to change scheme to use Release configuration for Run,
74 | or it may be seriously slow.
75 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/include/chat.h:
--------------------------------------------------------------------------------
 1 | //
 2 | //  chat.h
 3 | //  alpaca.cpp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/16/23.
 6 | //
 7 | 
 8 | #pragma once
 9 | 
10 | #include <ggml.h>
11 | #include <utils.h>
12 | 
13 | #include <map>
14 | #include <string>
15 | #include <vector>
16 | 
17 | // default hparams (LLaMA 7B)
18 | struct llama_hparams {
19 |     int32_t n_vocab = 32000;
20 |     int32_t n_ctx   = 512;   // this is provided as user input?
21 |     int32_t n_embd  = 4096;
22 |     int32_t n_mult  = 256;
23 |     int32_t n_head  = 32;
24 |     int32_t n_layer = 32;
25 |     int32_t n_rot   = 64;
26 |     int32_t f16     = 1;
27 | };
28 | 
29 | struct llama_layer {
30 |     // normalization
31 |     struct ggml_tensor * attention_norm;
32 | 
33 |     // attention
34 |     struct ggml_tensor * wq;
35 |     struct ggml_tensor * wk;
36 |     struct ggml_tensor * wv;
37 |     struct ggml_tensor * wo;
38 | 
39 |     // normalization
40 |     struct ggml_tensor * ffn_norm;
41 | 
42 |     // ff
43 |     struct ggml_tensor * w1;
44 |     struct ggml_tensor * w2;
45 |     struct ggml_tensor * w3;
46 | };
47 | 
48 | struct mbuf_t {
49 |     mbuf_t(): buf(nullptr), size(0), p(nullptr), oft(0) {};
50 | 
51 |     char* buf;
52 |     size_t size;
53 |     char* p;
54 |     size_t oft;
55 | };
56 | 
57 | struct llama_model {
58 |     llama_hparams hparams;
59 | 
60 |     struct ggml_tensor * tok_embeddings;
61 | 
62 |     struct ggml_tensor * norm;
63 |     struct ggml_tensor * output;
64 | 
65 |     std::vector<llama_layer> layers;
66 | 
67 |     // key + value memory
68 |     struct ggml_tensor * memory_k;
69 |     struct ggml_tensor * memory_v;
70 | 
71 |     //
72 |     struct ggml_context * ctx;
73 |     std::map<std::string, struct ggml_tensor *> tensors;
74 | 
75 |     mbuf_t mbuf;
76 | };
77 | 
78 | void llma_model_unload(llama_model &model);
79 | 
80 | bool llama_model_load(const std::string &fname,
81 |                       llama_model &model,
82 |                       gpt_vocab &vocab,
83 |                       int n_ctx);
84 | 
85 | bool llama_model_load_lowmem(const std::string &fname,
86 |                              llama_model &model,
87 |                              gpt_vocab &vocab,
88 |                              int n_ctx);
89 | 
90 | bool llama_eval(const llama_model &model,
91 |                 const int n_threads,
92 |                 const int n_past,
93 |                 const std::vector<gpt_vocab::id> &embd_inp,
94 |                 std::vector<float> &embd_w,
95 |                 size_t &mem_per_token);
96 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/MessageView.swift:
--------------------------------------------------------------------------------
 1 | //
 2 | //  MessageView.swift
 3 | //  AlpacaChatApp
 4 | //
 5 | //  Created by Yoshimasa Niwa on 3/20/23.
 6 | //
 7 | 
 8 | import SwiftUI
 9 | 
10 | struct MessageView: View {
11 |     var message: Message
12 | 
13 |     private struct SenderView: View {
14 |         var sender: Message.Sender
15 | 
16 |         var body: some View {
17 |             switch sender {
18 |             case .user:
19 |                 Text("You")
20 |                     .font(.caption)
21 |                     .foregroundColor(.accentColor)
22 |             case .system:
23 |                 Text("Alpaca")
24 |                     .font(.caption)
25 |                     .foregroundColor(.accentColor)
26 |             }
27 |         }
28 |     }
29 | 
30 |     private struct MessageContentView: View {
31 |         var message: Message
32 | 
33 |         var body: some View {
34 |             switch message.state {
35 |             case .none:
36 |                 ProgressView()
37 |             case .error:
38 |                 Text(message.text)
39 |                     .foregroundColor(Color.red)
40 |             case .typed:
41 |                 Text(message.text)
42 |             case .predicting:
43 |                 HStack {
44 |                     Text(message.text)
45 |                     ProgressView()
46 |                         .padding(.leading, 3.0)
47 |                 }
48 |             case .predicted(tokensPerSeconds: let tokenPerSeconds):
49 |                 VStack(alignment: .leading) {
50 |                     Text(message.text)
51 |                     Text(String(format: "%.2f tokens/s", tokenPerSeconds))
52 |                         .font(.footnote)
53 |                         .foregroundColor(Color.gray)
54 |                 }
55 |             }
56 |         }
57 |     }
58 | 
59 |     var body: some View {
60 |         HStack {
61 |             if message.sender == .user {
62 |                 Spacer()
63 |             }
64 | 
65 |             VStack(alignment: .leading, spacing: 6.0) {
66 |                 SenderView(sender: message.sender)
67 |                 MessageContentView(message: message)
68 |                     .padding(12.0)
69 |                     .background(Color.secondary.opacity(0.2))
70 |                     .cornerRadius(12.0)
71 |             }
72 | 
73 |             if message.sender == .system {
74 |                 Spacer()
75 |             }
76 |         }
77 |     }
78 | }
79 | 
80 | struct MessageView_Previews: PreviewProvider {
81 |     static var previews: some View {
82 |         VStack {
83 |             MessageView(message: Message(sender: .user, state: .none, text: "none"))
84 |             MessageView(message: Message(sender: .user, state: .error, text: "error"))
85 |             MessageView(message: Message(sender: .user, state: .predicting, text: "predicting"))
86 |             MessageView(message: Message(sender: .user, state: .predicted(tokensPerSeconds: 3.1415), text: "predicted"))
87 |         }
88 |     }
89 | }
90 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp.xcodeproj/xcshareddata/xcschemes/AlpacaChatApp.xcscheme:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <Scheme
 3 |    LastUpgradeVersion = "1430"
 4 |    version = "1.7">
 5 |    <BuildAction
 6 |       parallelizeBuildables = "YES"
 7 |       buildImplicitDependencies = "YES">
 8 |       <BuildActionEntries>
 9 |          <BuildActionEntry
10 |             buildForTesting = "YES"
11 |             buildForRunning = "YES"
12 |             buildForProfiling = "YES"
13 |             buildForArchiving = "YES"
14 |             buildForAnalyzing = "YES">
15 |             <BuildableReference
16 |                BuildableIdentifier = "primary"
17 |                BlueprintIdentifier = "54B223E129C5CF9F006F4683"
18 |                BuildableName = "AlpacaChatApp.app"
19 |                BlueprintName = "AlpacaChatApp"
20 |                ReferencedContainer = "container:AlpacaChatApp.xcodeproj">
21 |             </BuildableReference>
22 |          </BuildActionEntry>
23 |       </BuildActionEntries>
24 |    </BuildAction>
25 |    <TestAction
26 |       buildConfiguration = "Debug"
27 |       selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
28 |       selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
29 |       shouldUseLaunchSchemeArgsEnv = "YES"
30 |       shouldAutocreateTestPlan = "YES">
31 |    </TestAction>
32 |    <LaunchAction
33 |       buildConfiguration = "Debug"
34 |       selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
35 |       selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
36 |       launchStyle = "0"
37 |       useCustomWorkingDirectory = "NO"
38 |       ignoresPersistentStateOnLaunch = "NO"
39 |       debugDocumentVersioning = "YES"
40 |       debugServiceExtension = "internal"
41 |       allowLocationSimulation = "YES">
42 |       <BuildableProductRunnable
43 |          runnableDebuggingMode = "0">
44 |          <BuildableReference
45 |             BuildableIdentifier = "primary"
46 |             BlueprintIdentifier = "54B223E129C5CF9F006F4683"
47 |             BuildableName = "AlpacaChatApp.app"
48 |             BlueprintName = "AlpacaChatApp"
49 |             ReferencedContainer = "container:AlpacaChatApp.xcodeproj">
50 |          </BuildableReference>
51 |       </BuildableProductRunnable>
52 |    </LaunchAction>
53 |    <ProfileAction
54 |       buildConfiguration = "Release"
55 |       shouldUseLaunchSchemeArgsEnv = "YES"
56 |       savedToolIdentifier = ""
57 |       useCustomWorkingDirectory = "NO"
58 |       debugDocumentVersioning = "YES">
59 |       <BuildableProductRunnable
60 |          runnableDebuggingMode = "0">
61 |          <BuildableReference
62 |             BuildableIdentifier = "primary"
63 |             BlueprintIdentifier = "54B223E129C5CF9F006F4683"
64 |             BuildableName = "AlpacaChatApp.app"
65 |             BlueprintName = "AlpacaChatApp"
66 |             ReferencedContainer = "container:AlpacaChatApp.xcodeproj">
67 |          </BuildableReference>
68 |       </BuildableProductRunnable>
69 |    </ProfileAction>
70 |    <AnalyzeAction
71 |       buildConfiguration = "Debug">
72 |    </AnalyzeAction>
73 |    <ArchiveAction
74 |       buildConfiguration = "Release"
75 |       revealArchiveInOrganizer = "YES">
76 |    </ArchiveAction>
77 | </Scheme>
78 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp/Sources/ChatViewModel.swift:
--------------------------------------------------------------------------------
  1 | //
  2 | //  ChatViewModel.swift
  3 | //  AlpacaChatApp
  4 | //
  5 | //  Created by Yoshimasa Niwa on 3/19/23.
  6 | //
  7 | 
  8 | import AlpacaChat
  9 | import Foundation
 10 | import os
 11 | 
 12 | private extension Duration {
 13 |     var seconds: Double {
 14 |         Double(components.seconds) + Double(components.attoseconds) / 1.0e18
 15 |     }
 16 | }
 17 | 
 18 | @MainActor
 19 | final class ChatViewModel: ObservableObject {
 20 |     enum State {
 21 |         case none
 22 |         case loading
 23 |         case completed
 24 |     }
 25 | 
 26 |     private var chat: Chat?
 27 | 
 28 |     @Published
 29 |     var state: State = .none
 30 | 
 31 |     @Published
 32 |     var messages: [Message] = []
 33 | 
 34 |     func prepare() async {
 35 |         guard chat == nil else {
 36 |             return
 37 |         }
 38 | 
 39 |         do {
 40 |             state = .loading
 41 |             guard let modelURL = Bundle.main.url(forResource: "model", withExtension: "bin") else {
 42 |                 throw "Model not found."
 43 |             }
 44 | 
 45 |             let contextSize: Int32
 46 |             let isLowMemory: Bool
 47 | #if targetEnvironment(simulator)
 48 |             contextSize = 2048
 49 |             isLowMemory = false
 50 | #else
 51 |             let memorySize = os_proc_available_memory()
 52 |             if memorySize > 6 * 1024 * 1024 * 1024 {
 53 |                 contextSize = 2048
 54 |                 isLowMemory = false
 55 |             } else {
 56 |                 contextSize = 512
 57 |                 isLowMemory = true
 58 |             }
 59 | #endif
 60 |             let model = try await Model.load(from: modelURL, contextSize: contextSize, isLowMemory: isLowMemory)
 61 |             chat = Chat(model: model)
 62 |         } catch {
 63 |             let message = Message(sender: .system, text: "Failed to load model.")
 64 |             messages.append(message)
 65 |         }
 66 |         state = .completed
 67 |     }
 68 | 
 69 |     func send(message text: String) async {
 70 |         let requestMessage = Message(sender: .user, state: .typed, text: text)
 71 |         messages.append(requestMessage)
 72 | 
 73 |         guard let chat = chat else {
 74 |             let message = Message(sender: .system, state: .error, text: "Chat is unavailable.")
 75 |             messages.append(message)
 76 |             return
 77 |         }
 78 | 
 79 |         do {
 80 |             var message = Message(sender: .system, text: "")
 81 |             messages.append(message)
 82 |             let messageIndex = messages.endIndex - 1
 83 | 
 84 |             var numberOfTokens = 0
 85 |             let duration = try await ContinuousClock().measure {
 86 |                 for try await token in chat.predictTokens(for: text) {
 87 |                     message.state = .predicting
 88 |                     message.text += token
 89 | 
 90 |                     var updatedMessages = messages
 91 |                     updatedMessages[messageIndex] = message
 92 |                     messages = updatedMessages
 93 | 
 94 |                     numberOfTokens += 1
 95 |                 }
 96 |             }
 97 |             message.state = .predicted(tokensPerSeconds: Double(numberOfTokens) / duration.seconds)
 98 |             messages[messageIndex] = message
 99 |         } catch {
100 |             let message = Message(sender: .system, state: .error, text: error.localizedDescription)
101 |             messages.append(message)
102 |         }
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/include/utils.h:
--------------------------------------------------------------------------------
  1 | // Various helper functions and utilities
  2 | 
  3 | #pragma once
  4 | 
  5 | #include <string>
  6 | #include <map>
  7 | #include <vector>
  8 | #include <random>
  9 | #include <thread>
 10 | 
 11 | //
 12 | // CLI argument parsing
 13 | //
 14 | 
 15 | struct gpt_params {
 16 |     int32_t seed      = -1; // RNG seed
 17 |     int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
 18 |     int32_t n_predict = 128; // new tokens to predict
 19 |     int32_t repeat_last_n = 64;  // last n tokens to penalize
 20 |     int32_t n_ctx = 512; //context size
 21 |     
 22 |     // sampling parameters
 23 |     int32_t top_k = 40;
 24 |     float   top_p = 0.95f;
 25 |     float   temp  = 0.80f;
 26 |     float   repeat_penalty  = 1.30f;
 27 | 
 28 |     int32_t n_batch = 8; // batch size for prompt processing
 29 | 
 30 |     std::string model = "models/lamma-7B/ggml-model.bin"; // model path
 31 |     std::string prompt;
 32 | 
 33 |     bool use_color = false; // use color to distinguish generations and inputs
 34 | 
 35 |     bool interactive = false; // interactive mode
 36 |     bool interactive_start = false; // reverse prompt immediately
 37 |     std::string antiprompt = ""; // string upon seeing which more user input is prompted
 38 | };
 39 | 
 40 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
 41 | 
 42 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
 43 | 
 44 | std::string gpt_random_prompt(std::mt19937 & rng);
 45 | 
 46 | //
 47 | // Vocab utils
 48 | //
 49 | 
 50 | struct gpt_vocab {
 51 |     using id    = int32_t;
 52 |     using token = std::string;
 53 | 
 54 |     std::map<token, id> token_to_id;
 55 |     std::map<id, token> id_to_token;
 56 | };
 57 | 
 58 | void replace(std::string & str, const std::string & needle, const std::string & replacement);
 59 | 
 60 | // poor-man's JSON parsing
 61 | std::map<std::string, int32_t> json_parse(const std::string & fname);
 62 | 
 63 | // split text into tokens
 64 | //
 65 | // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
 66 | //
 67 | // Regex (Python):
 68 | // r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
 69 | //
 70 | // Regex (C++):
 71 | // R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
 72 | //
 73 | std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
 74 | 
 75 | // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 76 | // ref: https://github.com/google/sentencepiece
 77 | std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
 78 | 
 79 | // load the tokens from encoder.json
 80 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
 81 | 
 82 | // sample next token given probabilities for each embedding
 83 | //
 84 | //   - consider only the top K tokens
 85 | //   - from them, consider only the top tokens with cumulative probability > P
 86 | //
 87 | gpt_vocab::id llama_sample_top_p_top_k(
 88 |         const gpt_vocab & vocab,
 89 |         const float * logits,
 90 |         std::vector<gpt_vocab::id> & last_n_tokens,
 91 |         double repeat_penalty,
 92 |         int top_k,
 93 |         double top_p,
 94 |         double temp,
 95 |         std::mt19937 & rng);
 96 | 
 97 | // filer to top K tokens from list of logits
 98 | void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);
 99 | 
100 | //
101 | // Quantization
102 | //
103 | 
104 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);
105 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);
106 | 


--------------------------------------------------------------------------------
/Sources/AlpacaChatObjC/ALPChat.mm:
--------------------------------------------------------------------------------
  1 | //
  2 | //  ALPChat.mm
  3 | //  AlpacaChatObjC
  4 | //
  5 | //  Created by Yoshimasa Niwa on 3/16/23.
  6 | //
  7 | 
  8 | #import "ALPChat.h"
  9 | #import "ALPChatModel.h"
 10 | 
 11 | #include <chat.h>
 12 | 
 13 | #include <atomic>
 14 | #include <vector>
 15 | 
 16 | NSString * const ALPChatModelErrorDomain = @"ALPChatModelErrorDomain";
 17 | 
 18 | @implementation ALPChatModel
 19 | {
 20 | @public
 21 |     llama_model _model;
 22 |     gpt_vocab _vocab;
 23 | }
 24 | 
 25 | + (ALPChatModel *)loadFromURL:(NSURL *)URL
 26 |                   contextSize:(int)contextSize
 27 |                   isLowMemory:(BOOL)isLowMemory
 28 |                         error:(NSError **)error
 29 | {
 30 |     gpt_vocab vocab;
 31 |     llama_model model;
 32 | 
 33 |     bool success = false;
 34 |     if (isLowMemory) {
 35 |         success = llama_model_load_lowmem(URL.fileSystemRepresentation, model, vocab, contextSize);
 36 |     } else {
 37 |         success = llama_model_load(URL.fileSystemRepresentation, model, vocab, contextSize);
 38 |     }
 39 |     if (!success) {
 40 |         if (error) {
 41 |             NSString * const failureReason = [[NSString alloc] initWithFormat:@"failed to load model: %@", URL];
 42 |             NSDictionary * const userInfo = @{
 43 |                 NSLocalizedFailureReasonErrorKey: failureReason
 44 |             };
 45 |             *error = [[NSError alloc] initWithDomain:ALPChatModelErrorDomain
 46 |                                                 code:ALPChatModelErrorCodeFailedToLoad
 47 |                                             userInfo:userInfo];
 48 |         }
 49 |         return nil;
 50 |     }
 51 | 
 52 |     return [[ALPChatModel alloc] initWithModel:model vocab:vocab];
 53 | }
 54 | 
 55 | - (instancetype)initWithModel:(const llama_model &)model vocab:(const gpt_vocab &)vocab
 56 | {
 57 |     if (self = [super init]) {
 58 |         _model = model;
 59 |         _vocab = vocab;
 60 |     }
 61 |     return self;
 62 | }
 63 | 
 64 | - (instancetype)init
 65 | {
 66 |     [self doesNotRecognizeSelector:_cmd];
 67 |     abort();
 68 | }
 69 | 
 70 | - (void)dealloc
 71 | {
 72 |     llma_model_unload(_model);
 73 | }
 74 | 
 75 | @end
 76 | 
 77 | // MARK: -
 78 | 
 79 | @interface ALPChatPredicationCancellable : NSObject <ALPChatCancellable>
 80 | 
 81 | @end
 82 | 
 83 | @implementation ALPChatPredicationCancellable
 84 | {
 85 | @public
 86 |     std::atomic<bool> _cancelled;
 87 | }
 88 | 
 89 | - (instancetype)init
 90 | {
 91 |     if (self = [super init]) {
 92 |         _cancelled.store(false);
 93 |     }
 94 |     return self;
 95 | }
 96 | 
 97 | - (void)cancel
 98 | {
 99 |     _cancelled.store(true);
100 | }
101 | 
102 | @end
103 | 
104 | // MARK: -
105 | 
106 | NSString * const ALPChatErrorDomain = @"ALPChatErrorDomain";
107 | 
108 | @implementation ALPChat
109 | {
110 |     ALPChatModel *_model;
111 |     dispatch_queue_t _workerQueue;
112 | 
113 |     gpt_params _params;
114 | 
115 |     std::mt19937 _rng;
116 | 
117 |     int _n_past;
118 |     int _n_remaining_tokens;
119 | 
120 |     std::vector<gpt_vocab::id> _request_tokens;
121 |     std::vector<gpt_vocab::id> _response_tokens;
122 | 
123 |     std::vector<gpt_vocab::id> _embd;
124 |     std::vector<gpt_vocab::id> _last_n_tokens;
125 | 
126 |     std::vector<float> _logits;
127 |     size_t _mem_per_token;
128 | 
129 |     bool _prepared;
130 | }
131 | 
132 | - (instancetype)initWithModel:(ALPChatModel *)model
133 | {
134 |     if (self = [super init]) {
135 |         _model = model;
136 |         _workerQueue = dispatch_queue_create("ALPChat.workerQueue", DISPATCH_QUEUE_SERIAL_WITH_AUTORELEASE_POOL);
137 |         dispatch_async(_workerQueue, ^{
138 |             [self _alp_worker_initialize];
139 |         });
140 |     }
141 |     return self;
142 | }
143 | 
144 | - (instancetype)init
145 | {
146 |     [self doesNotRecognizeSelector:_cmd];
147 |     abort();
148 | }
149 | 
150 | - (void)_alp_worker_initialize
151 | {
152 |     // Use mostly default values.
153 |     _params.temp = 0.1f;
154 |     _params.n_threads = (int32_t)std::thread::hardware_concurrency();
155 | #if DEBUG
156 |     fprintf(stderr, "%s: hardware concurrency = %d\n", __func__, (int32_t) std::thread::hardware_concurrency());
157 |     fprintf(stderr, "%s: n_threads = %d\n", __func__, _params.n_threads);
158 | #endif // DEBUG
159 | 
160 |     const int32_t seed = (int32_t)time(NULL);
161 |     _rng = std::mt19937(seed);
162 | 
163 |     _n_past = 0;
164 |     _n_remaining_tokens = 0;
165 | 
166 |     _request_tokens = ::llama_tokenize(_model->_vocab, "## Instruction:\n\n", true);
167 |     _response_tokens = ::llama_tokenize(_model->_vocab, "\n## Response:\n\n", false);
168 | 
169 |     _last_n_tokens = std::vector<gpt_vocab::id>(_params.repeat_last_n);
170 |     std::fill(_last_n_tokens.begin(), _last_n_tokens.end(), 0);
171 | }
172 | 
173 | - (id<ALPChatCancellable>)predictTokensForPrompt:(NSString *)prompt
174 |                                     tokenHandler:(nullable void (^)(NSString *token))tokenHandler
175 |                                completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler
176 | {
177 |     ALPChatPredicationCancellable * const cancellable = [[ALPChatPredicationCancellable alloc] init];
178 |     dispatch_async(_workerQueue, ^{
179 |         [self _alp_worker_predictTokensForPrompt:prompt
180 |                                     tokenHandler:tokenHandler
181 |                                completionHandler:completionHandler
182 |                                      cancellable:cancellable];
183 |     });
184 |     return cancellable;
185 | }
186 | 
187 | - (void)_alp_worker_predictTokensForPrompt:(NSString *)prompt
188 |                               tokenHandler:(nullable void (^)(NSString *token))tokenHandler
189 |                          completionHandler:(nullable void (^)(NSError * _Nullable error))completionHandler
190 |                                cancellable:(ALPChatPredicationCancellable *)cancellable
191 | {
192 |     std::vector<gpt_vocab::id> input_tokens;
193 | 
194 |     if (!_prepared) {
195 |         // Determine the required inference memory per token.
196 |         // This takes some duration.
197 |         llama_eval(_model->_model, _params.n_threads, 0, { 0, 1, 2, 3 }, _logits, _mem_per_token);
198 | 
199 |         // We may want to slide the input window along with the context,
200 |         // but for now we restrict to the context length.
201 |         _n_remaining_tokens = _model->_model.hparams.n_ctx;
202 | 
203 |         _prepared = true;
204 |     }
205 | 
206 |     input_tokens.insert(input_tokens.end(), _request_tokens.begin(), _request_tokens.end());
207 | 
208 |     const char * const promptCString = [prompt cStringUsingEncoding:NSUTF8StringEncoding];
209 |     std::vector<gpt_vocab::id> prompt_tokens = ::llama_tokenize(_model->_vocab, promptCString, false);
210 |     input_tokens.insert(input_tokens.end(), prompt_tokens.begin(), prompt_tokens.end());
211 |     input_tokens.insert(input_tokens.end(), _response_tokens.begin(), _response_tokens.end());
212 | 
213 |     _n_remaining_tokens -= _request_tokens.size() + prompt_tokens.size() + _response_tokens.size();
214 | 
215 |     int n_consumed_input_tokens = 0;
216 |     bool is_input_tokens_consumed = false;
217 | 
218 |     while (_n_remaining_tokens > 0) {
219 |         if (cancellable->_cancelled.load()) {
220 |             if (completionHandler) {
221 |                 NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain
222 |                                                                    code:ALPChatErrorCodeCancelled
223 |                                                                userInfo:nil];
224 |                 completionHandler(error);
225 |             }
226 |             return;
227 |         }
228 | #if DEBUG
229 |         fprintf(stderr, "\nremaining_tokens = %d\n", _n_remaining_tokens);
230 | #endif // DEBUG
231 | 
232 |         // Predict
233 |         if (_embd.size() > 0) {
234 | #if DEBUG
235 |             const int64_t t_start_sample_us = ggml_time_us();
236 |             fprintf(stderr, "start predicting...\n");
237 | #endif // DEBUG
238 |             if (!llama_eval(_model->_model, _params.n_threads, _n_past, _embd, _logits, _mem_per_token)) {
239 |                 if (completionHandler) {
240 |                     NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain
241 |                                                                        code:ALPChatErrorCodeFailedToPredict
242 |                                                                    userInfo:nil];
243 |                     completionHandler(error);
244 |                 }
245 |                 return;
246 |             }
247 | #if DEBUG
248 |             fprintf(stderr, "done %8.2f ms\n", (ggml_time_us() - t_start_sample_us) / 1000.0f);
249 | #endif // DEBUG
250 |         }
251 | 
252 |         _n_past += _embd.size();
253 |         _embd.clear();
254 | 
255 |         if (n_consumed_input_tokens >= input_tokens.size()) {
256 |             is_input_tokens_consumed = true;
257 |         }
258 | 
259 |         if (is_input_tokens_consumed) {
260 |             const float top_k = _params.top_k;
261 |             const float top_p = _params.top_p;
262 |             const float temp  = _params.temp;
263 |             const float repeat_penalty = _params.repeat_penalty;
264 | 
265 |             const int n_vocab = _model->_model.hparams.n_vocab;
266 | 
267 |             gpt_vocab::id ident = llama_sample_top_p_top_k(_model->_vocab, _logits.data() + (_logits.size() - n_vocab), _last_n_tokens, repeat_penalty, top_k, top_p, temp, _rng);
268 | 
269 |             _last_n_tokens.erase(_last_n_tokens.begin());
270 |             _last_n_tokens.push_back(ident);
271 | 
272 |             // add it to the context
273 |             _embd.push_back(ident);
274 | 
275 |             // decrement remaining sampling budget
276 |             --_n_remaining_tokens;
277 |         } else {
278 |             while (n_consumed_input_tokens < input_tokens.size()) {
279 | #if DEBUG
280 |                 fprintf(stderr, "%6d -> '%s'\n", input_tokens[n_consumed_input_tokens], _model->_vocab.id_to_token.at(input_tokens[n_consumed_input_tokens]).c_str());
281 | #endif // DEBUG
282 | 
283 |                 _embd.push_back(input_tokens[n_consumed_input_tokens]);
284 | 
285 |                 _last_n_tokens.erase(_last_n_tokens.begin());
286 |                 _last_n_tokens.push_back(input_tokens[n_consumed_input_tokens]);
287 |                 ++n_consumed_input_tokens;
288 | 
289 |                 if (_embd.size() > _params.n_batch) {
290 |                     break;
291 |                 }
292 |             }
293 |         }
294 | 
295 | #if DEBUG
296 |         {
297 | #else
298 |         if (is_input_tokens_consumed) {
299 | #endif // DEBUG
300 |             for (auto ident : _embd) {
301 |                 const char *tokenCString = _model->_vocab.id_to_token[ident].c_str();
302 | #if DEBUG
303 |                 printf("%s", tokenCString);
304 | 
305 |                 if (is_input_tokens_consumed) {
306 | #endif // DEBUG
307 |                     if (tokenHandler) {
308 |                         NSString * const tokenString = [[NSString alloc] initWithUTF8String:tokenCString];
309 |                         tokenHandler(tokenString);
310 |                     }
311 | #if DEBUG
312 |                 }
313 | #endif // DEBUG
314 |             }
315 | #if DEBUG
316 |             fflush(stdout);
317 | #endif // DEBUG
318 |         }
319 | 
320 |         if (_embd.size() > 0 && _embd.back() == 2) {
321 | #if DEBUG
322 |             fprintf(stderr, " [end of text]\n");
323 | #endif // DEBUG
324 |             if (completionHandler) {
325 |                 completionHandler(nil);
326 |             }
327 |             return;
328 |         }
329 |     }
330 | 
331 |     if (completionHandler) {
332 |         NSError * const error = [[NSError alloc] initWithDomain:ALPChatErrorDomain
333 |                                                            code:ALPChatErrorCodeNoRemainingTokens
334 |                                                        userInfo:nil];
335 |         completionHandler(error);
336 |     }
337 | }
338 | 
339 | @end
340 | 


--------------------------------------------------------------------------------
/Applications/AlpacaChatApp.xcodeproj/project.pbxproj:
--------------------------------------------------------------------------------
  1 | // !$*UTF8*$!
  2 | {
  3 | 	archiveVersion = 1;
  4 | 	classes = {
  5 | 	};
  6 | 	objectVersion = 56;
  7 | 	objects = {
  8 | 
  9 | /* Begin PBXBuildFile section */
 10 | 		5423912729C5D1840041C234 /* AlpacaChatApp.xcconfig in Resources */ = {isa = PBXBuildFile; fileRef = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */; };
 11 | 		54601CFB29C701F900E459DD /* ChatViewModel.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54601CFA29C701F900E459DD /* ChatViewModel.swift */; };
 12 | 		5498C52229D10E0A0090856F /* String.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5498C52129D10E0A0090856F /* String.swift */; };
 13 | 		54B223E629C5CF9F006F4683 /* AlpacaChatApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */; };
 14 | 		54B223E829C5CF9F006F4683 /* ChatView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54B223E729C5CF9F006F4683 /* ChatView.swift */; };
 15 | 		54B223EA29C5CF9F006F4683 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 54B223E929C5CF9F006F4683 /* Assets.xcassets */; };
 16 | 		54B223ED29C5CF9F006F4683 /* Preview Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */; };
 17 | 		54B223FC29C5D075006F4683 /* AlpacaChat in Frameworks */ = {isa = PBXBuildFile; productRef = 54B223FB29C5D075006F4683 /* AlpacaChat */; };
 18 | 		54B26D2F29C7D81A00A9AF05 /* model.bin in Resources */ = {isa = PBXBuildFile; fileRef = 54B26D2E29C7D81A00A9AF05 /* model.bin */; };
 19 | 		54E9B23829C97AEC00958DFE /* Message.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54E9B23629C97AEC00958DFE /* Message.swift */; };
 20 | 		54E9B23929C97AEC00958DFE /* MessageView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 54E9B23729C97AEC00958DFE /* MessageView.swift */; };
 21 | /* End PBXBuildFile section */
 22 | 
 23 | /* Begin PBXFileReference section */
 24 | 		5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.xcconfig; path = AlpacaChatApp.xcconfig; sourceTree = "<group>"; };
 25 | 		54601CFA29C701F900E459DD /* ChatViewModel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatViewModel.swift; sourceTree = "<group>"; };
 26 | 		5498C52129D10E0A0090856F /* String.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = String.swift; sourceTree = "<group>"; };
 27 | 		54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = AlpacaChatApp.app; sourceTree = BUILT_PRODUCTS_DIR; };
 28 | 		54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AlpacaChatApp.swift; sourceTree = "<group>"; };
 29 | 		54B223E729C5CF9F006F4683 /* ChatView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ChatView.swift; sourceTree = "<group>"; };
 30 | 		54B223E929C5CF9F006F4683 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
 31 | 		54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = "Preview Assets.xcassets"; sourceTree = "<group>"; };
 32 | 		54B223F629C5CFFA006F4683 /* AlpacaChat */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = AlpacaChat; path = ..; sourceTree = "<group>"; };
 33 | 		54B26D2D29C7CA7E00A9AF05 /* AlpacaChatApp.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = AlpacaChatApp.entitlements; sourceTree = "<group>"; };
 34 | 		54B26D2E29C7D81A00A9AF05 /* model.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; path = model.bin; sourceTree = "<group>"; };
 35 | 		54E9B23629C97AEC00958DFE /* Message.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Message.swift; sourceTree = "<group>"; };
 36 | 		54E9B23729C97AEC00958DFE /* MessageView.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = MessageView.swift; sourceTree = "<group>"; };
 37 | /* End PBXFileReference section */
 38 | 
 39 | /* Begin PBXFrameworksBuildPhase section */
 40 | 		54B223DF29C5CF9F006F4683 /* Frameworks */ = {
 41 | 			isa = PBXFrameworksBuildPhase;
 42 | 			buildActionMask = 2147483647;
 43 | 			files = (
 44 | 				54B223FC29C5D075006F4683 /* AlpacaChat in Frameworks */,
 45 | 			);
 46 | 			runOnlyForDeploymentPostprocessing = 0;
 47 | 		};
 48 | /* End PBXFrameworksBuildPhase section */
 49 | 
 50 | /* Begin PBXGroup section */
 51 | 		5423912529C5D1530041C234 /* Configurations */ = {
 52 | 			isa = PBXGroup;
 53 | 			children = (
 54 | 				5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */,
 55 | 			);
 56 | 			path = Configurations;
 57 | 			sourceTree = "<group>";
 58 | 		};
 59 | 		54B223D929C5CF9F006F4683 = {
 60 | 			isa = PBXGroup;
 61 | 			children = (
 62 | 				54B223E429C5CF9F006F4683 /* AlpacaChatApp */,
 63 | 				54B223F529C5CFFA006F4683 /* Packages */,
 64 | 				54B223E329C5CF9F006F4683 /* Products */,
 65 | 				54B223FA29C5D075006F4683 /* Frameworks */,
 66 | 			);
 67 | 			sourceTree = "<group>";
 68 | 		};
 69 | 		54B223E329C5CF9F006F4683 /* Products */ = {
 70 | 			isa = PBXGroup;
 71 | 			children = (
 72 | 				54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */,
 73 | 			);
 74 | 			name = Products;
 75 | 			sourceTree = "<group>";
 76 | 		};
 77 | 		54B223E429C5CF9F006F4683 /* AlpacaChatApp */ = {
 78 | 			isa = PBXGroup;
 79 | 			children = (
 80 | 				5423912529C5D1530041C234 /* Configurations */,
 81 | 				54B223EB29C5CF9F006F4683 /* Preview Content */,
 82 | 				54B223F429C5CFC6006F4683 /* Resources */,
 83 | 				54B223F329C5CFBF006F4683 /* Sources */,
 84 | 				54B26D2C29C7CA6400A9AF05 /* Supporting Files */,
 85 | 			);
 86 | 			path = AlpacaChatApp;
 87 | 			sourceTree = "<group>";
 88 | 		};
 89 | 		54B223EB29C5CF9F006F4683 /* Preview Content */ = {
 90 | 			isa = PBXGroup;
 91 | 			children = (
 92 | 				54B223EC29C5CF9F006F4683 /* Preview Assets.xcassets */,
 93 | 			);
 94 | 			path = "Preview Content";
 95 | 			sourceTree = "<group>";
 96 | 		};
 97 | 		54B223F329C5CFBF006F4683 /* Sources */ = {
 98 | 			isa = PBXGroup;
 99 | 			children = (
100 | 				54B223E529C5CF9F006F4683 /* AlpacaChatApp.swift */,
101 | 				54B223E729C5CF9F006F4683 /* ChatView.swift */,
102 | 				54601CFA29C701F900E459DD /* ChatViewModel.swift */,
103 | 				54E9B23629C97AEC00958DFE /* Message.swift */,
104 | 				54E9B23729C97AEC00958DFE /* MessageView.swift */,
105 | 				5498C52129D10E0A0090856F /* String.swift */,
106 | 			);
107 | 			path = Sources;
108 | 			sourceTree = "<group>";
109 | 		};
110 | 		54B223F429C5CFC6006F4683 /* Resources */ = {
111 | 			isa = PBXGroup;
112 | 			children = (
113 | 				54B26D2E29C7D81A00A9AF05 /* model.bin */,
114 | 				54B223E929C5CF9F006F4683 /* Assets.xcassets */,
115 | 			);
116 | 			path = Resources;
117 | 			sourceTree = "<group>";
118 | 		};
119 | 		54B223F529C5CFFA006F4683 /* Packages */ = {
120 | 			isa = PBXGroup;
121 | 			children = (
122 | 				54B223F629C5CFFA006F4683 /* AlpacaChat */,
123 | 			);
124 | 			name = Packages;
125 | 			sourceTree = "<group>";
126 | 		};
127 | 		54B223FA29C5D075006F4683 /* Frameworks */ = {
128 | 			isa = PBXGroup;
129 | 			children = (
130 | 			);
131 | 			name = Frameworks;
132 | 			sourceTree = "<group>";
133 | 		};
134 | 		54B26D2C29C7CA6400A9AF05 /* Supporting Files */ = {
135 | 			isa = PBXGroup;
136 | 			children = (
137 | 				54B26D2D29C7CA7E00A9AF05 /* AlpacaChatApp.entitlements */,
138 | 			);
139 | 			path = "Supporting Files";
140 | 			sourceTree = "<group>";
141 | 		};
142 | /* End PBXGroup section */
143 | 
144 | /* Begin PBXNativeTarget section */
145 | 		54B223E129C5CF9F006F4683 /* AlpacaChatApp */ = {
146 | 			isa = PBXNativeTarget;
147 | 			buildConfigurationList = 54B223F029C5CF9F006F4683 /* Build configuration list for PBXNativeTarget "AlpacaChatApp" */;
148 | 			buildPhases = (
149 | 				54B223DE29C5CF9F006F4683 /* Sources */,
150 | 				54B223DF29C5CF9F006F4683 /* Frameworks */,
151 | 				54B223E029C5CF9F006F4683 /* Resources */,
152 | 			);
153 | 			buildRules = (
154 | 			);
155 | 			dependencies = (
156 | 			);
157 | 			name = AlpacaChatApp;
158 | 			packageProductDependencies = (
159 | 				54B223FB29C5D075006F4683 /* AlpacaChat */,
160 | 			);
161 | 			productName = AlpacaChatApp;
162 | 			productReference = 54B223E229C5CF9F006F4683 /* AlpacaChatApp.app */;
163 | 			productType = "com.apple.product-type.application";
164 | 		};
165 | /* End PBXNativeTarget section */
166 | 
167 | /* Begin PBXProject section */
168 | 		54B223DA29C5CF9F006F4683 /* Project object */ = {
169 | 			isa = PBXProject;
170 | 			attributes = {
171 | 				BuildIndependentTargetsInParallel = 1;
172 | 				LastSwiftUpdateCheck = 1430;
173 | 				LastUpgradeCheck = 1430;
174 | 				TargetAttributes = {
175 | 					54B223E129C5CF9F006F4683 = {
176 | 						CreatedOnToolsVersion = 14.3;
177 | 					};
178 | 				};
179 | 			};
180 | 			buildConfigurationList = 54B223DD29C5CF9F006F4683 /* Build configuration list for PBXProject "AlpacaChatApp" */;
181 | 			compatibilityVersion = "Xcode 14.0";
182 | 			developmentRegion = en;
183 | 			hasScannedForEncodings = 0;
184 | 			knownRegions = (
185 | 				en,
186 | 				Base,
187 | 			);
188 | 			mainGroup = 54B223D929C5CF9F006F4683;
189 | 			productRefGroup = 54B223E329C5CF9F006F4683 /* Products */;
190 | 			projectDirPath = "";
191 | 			projectRoot = "";
192 | 			targets = (
193 | 				54B223E129C5CF9F006F4683 /* AlpacaChatApp */,
194 | 			);
195 | 		};
196 | /* End PBXProject section */
197 | 
198 | /* Begin PBXResourcesBuildPhase section */
199 | 		54B223E029C5CF9F006F4683 /* Resources */ = {
200 | 			isa = PBXResourcesBuildPhase;
201 | 			buildActionMask = 2147483647;
202 | 			files = (
203 | 				54B223ED29C5CF9F006F4683 /* Preview Assets.xcassets in Resources */,
204 | 				54B26D2F29C7D81A00A9AF05 /* model.bin in Resources */,
205 | 				54B223EA29C5CF9F006F4683 /* Assets.xcassets in Resources */,
206 | 				5423912729C5D1840041C234 /* AlpacaChatApp.xcconfig in Resources */,
207 | 			);
208 | 			runOnlyForDeploymentPostprocessing = 0;
209 | 		};
210 | /* End PBXResourcesBuildPhase section */
211 | 
212 | /* Begin PBXSourcesBuildPhase section */
213 | 		54B223DE29C5CF9F006F4683 /* Sources */ = {
214 | 			isa = PBXSourcesBuildPhase;
215 | 			buildActionMask = 2147483647;
216 | 			files = (
217 | 				5498C52229D10E0A0090856F /* String.swift in Sources */,
218 | 				54E9B23929C97AEC00958DFE /* MessageView.swift in Sources */,
219 | 				54B223E829C5CF9F006F4683 /* ChatView.swift in Sources */,
220 | 				54E9B23829C97AEC00958DFE /* Message.swift in Sources */,
221 | 				54601CFB29C701F900E459DD /* ChatViewModel.swift in Sources */,
222 | 				54B223E629C5CF9F006F4683 /* AlpacaChatApp.swift in Sources */,
223 | 			);
224 | 			runOnlyForDeploymentPostprocessing = 0;
225 | 		};
226 | /* End PBXSourcesBuildPhase section */
227 | 
228 | /* Begin XCBuildConfiguration section */
229 | 		54B223EE29C5CF9F006F4683 /* Debug */ = {
230 | 			isa = XCBuildConfiguration;
231 | 			buildSettings = {
232 | 				ALWAYS_SEARCH_USER_PATHS = NO;
233 | 				CLANG_ANALYZER_NONNULL = YES;
234 | 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
235 | 				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
236 | 				CLANG_ENABLE_MODULES = YES;
237 | 				CLANG_ENABLE_OBJC_ARC = YES;
238 | 				CLANG_ENABLE_OBJC_WEAK = YES;
239 | 				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
240 | 				CLANG_WARN_BOOL_CONVERSION = YES;
241 | 				CLANG_WARN_COMMA = YES;
242 | 				CLANG_WARN_CONSTANT_CONVERSION = YES;
243 | 				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
244 | 				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
245 | 				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
246 | 				CLANG_WARN_EMPTY_BODY = YES;
247 | 				CLANG_WARN_ENUM_CONVERSION = YES;
248 | 				CLANG_WARN_INFINITE_RECURSION = YES;
249 | 				CLANG_WARN_INT_CONVERSION = YES;
250 | 				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
251 | 				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
252 | 				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
253 | 				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
254 | 				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
255 | 				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
256 | 				CLANG_WARN_STRICT_PROTOTYPES = YES;
257 | 				CLANG_WARN_SUSPICIOUS_MOVE = YES;
258 | 				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
259 | 				CLANG_WARN_UNREACHABLE_CODE = YES;
260 | 				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
261 | 				COPY_PHASE_STRIP = NO;
262 | 				DEBUG_INFORMATION_FORMAT = dwarf;
263 | 				ENABLE_STRICT_OBJC_MSGSEND = YES;
264 | 				ENABLE_TESTABILITY = YES;
265 | 				GCC_C_LANGUAGE_STANDARD = gnu11;
266 | 				GCC_DYNAMIC_NO_PIC = NO;
267 | 				GCC_NO_COMMON_BLOCKS = YES;
268 | 				GCC_OPTIMIZATION_LEVEL = 0;
269 | 				GCC_PREPROCESSOR_DEFINITIONS = (
270 | 					"DEBUG=1",
271 | 					"$(inherited)",
272 | 				);
273 | 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
274 | 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
275 | 				GCC_WARN_UNDECLARED_SELECTOR = YES;
276 | 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
277 | 				GCC_WARN_UNUSED_FUNCTION = YES;
278 | 				GCC_WARN_UNUSED_VARIABLE = YES;
279 | 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
280 | 				MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
281 | 				MTL_FAST_MATH = YES;
282 | 				ONLY_ACTIVE_ARCH = YES;
283 | 				SDKROOT = iphoneos;
284 | 				SWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;
285 | 				SWIFT_OPTIMIZATION_LEVEL = "-Onone";
286 | 			};
287 | 			name = Debug;
288 | 		};
289 | 		54B223EF29C5CF9F006F4683 /* Release */ = {
290 | 			isa = XCBuildConfiguration;
291 | 			buildSettings = {
292 | 				ALWAYS_SEARCH_USER_PATHS = NO;
293 | 				CLANG_ANALYZER_NONNULL = YES;
294 | 				CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
295 | 				CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
296 | 				CLANG_ENABLE_MODULES = YES;
297 | 				CLANG_ENABLE_OBJC_ARC = YES;
298 | 				CLANG_ENABLE_OBJC_WEAK = YES;
299 | 				CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
300 | 				CLANG_WARN_BOOL_CONVERSION = YES;
301 | 				CLANG_WARN_COMMA = YES;
302 | 				CLANG_WARN_CONSTANT_CONVERSION = YES;
303 | 				CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
304 | 				CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
305 | 				CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
306 | 				CLANG_WARN_EMPTY_BODY = YES;
307 | 				CLANG_WARN_ENUM_CONVERSION = YES;
308 | 				CLANG_WARN_INFINITE_RECURSION = YES;
309 | 				CLANG_WARN_INT_CONVERSION = YES;
310 | 				CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
311 | 				CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
312 | 				CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
313 | 				CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
314 | 				CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
315 | 				CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
316 | 				CLANG_WARN_STRICT_PROTOTYPES = YES;
317 | 				CLANG_WARN_SUSPICIOUS_MOVE = YES;
318 | 				CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
319 | 				CLANG_WARN_UNREACHABLE_CODE = YES;
320 | 				CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
321 | 				COPY_PHASE_STRIP = NO;
322 | 				DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
323 | 				ENABLE_NS_ASSERTIONS = NO;
324 | 				ENABLE_STRICT_OBJC_MSGSEND = YES;
325 | 				GCC_C_LANGUAGE_STANDARD = gnu11;
326 | 				GCC_NO_COMMON_BLOCKS = YES;
327 | 				GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
328 | 				GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
329 | 				GCC_WARN_UNDECLARED_SELECTOR = YES;
330 | 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
331 | 				GCC_WARN_UNUSED_FUNCTION = YES;
332 | 				GCC_WARN_UNUSED_VARIABLE = YES;
333 | 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
334 | 				MTL_ENABLE_DEBUG_INFO = NO;
335 | 				MTL_FAST_MATH = YES;
336 | 				SDKROOT = iphoneos;
337 | 				SWIFT_COMPILATION_MODE = wholemodule;
338 | 				SWIFT_OPTIMIZATION_LEVEL = "-O";
339 | 				VALIDATE_PRODUCT = YES;
340 | 			};
341 | 			name = Release;
342 | 		};
343 | 		54B223F129C5CF9F006F4683 /* Debug */ = {
344 | 			isa = XCBuildConfiguration;
345 | 			baseConfigurationReference = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */;
346 | 			buildSettings = {
347 | 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
348 | 				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
349 | 				CODE_SIGN_ENTITLEMENTS = "AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements";
350 | 				CURRENT_PROJECT_VERSION = 1;
351 | 				DEVELOPMENT_ASSET_PATHS = "\"AlpacaChatApp/Preview Content\"";
352 | 				ENABLE_PREVIEWS = YES;
353 | 				GENERATE_INFOPLIST_FILE = YES;
354 | 				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
355 | 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
356 | 				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
357 | 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
358 | 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
359 | 				LD_RUNPATH_SEARCH_PATHS = (
360 | 					"$(inherited)",
361 | 					"@executable_path/Frameworks",
362 | 				);
363 | 				MARKETING_VERSION = 1.0;
364 | 				PRODUCT_NAME = "$(TARGET_NAME)";
365 | 				SWIFT_EMIT_LOC_STRINGS = YES;
366 | 				SWIFT_VERSION = 5.0;
367 | 				TARGETED_DEVICE_FAMILY = "1,2";
368 | 			};
369 | 			name = Debug;
370 | 		};
371 | 		54B223F229C5CF9F006F4683 /* Release */ = {
372 | 			isa = XCBuildConfiguration;
373 | 			baseConfigurationReference = 5423912629C5D1840041C234 /* AlpacaChatApp.xcconfig */;
374 | 			buildSettings = {
375 | 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
376 | 				ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
377 | 				CODE_SIGN_ENTITLEMENTS = "AlpacaChatApp/Supporting Files/AlpacaChatApp.entitlements";
378 | 				CURRENT_PROJECT_VERSION = 1;
379 | 				DEVELOPMENT_ASSET_PATHS = "\"AlpacaChatApp/Preview Content\"";
380 | 				ENABLE_PREVIEWS = YES;
381 | 				GENERATE_INFOPLIST_FILE = YES;
382 | 				INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES;
383 | 				INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
384 | 				INFOPLIST_KEY_UILaunchScreen_Generation = YES;
385 | 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
386 | 				INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
387 | 				LD_RUNPATH_SEARCH_PATHS = (
388 | 					"$(inherited)",
389 | 					"@executable_path/Frameworks",
390 | 				);
391 | 				MARKETING_VERSION = 1.0;
392 | 				PRODUCT_NAME = "$(TARGET_NAME)";
393 | 				SWIFT_EMIT_LOC_STRINGS = YES;
394 | 				SWIFT_VERSION = 5.0;
395 | 				TARGETED_DEVICE_FAMILY = "1,2";
396 | 			};
397 | 			name = Release;
398 | 		};
399 | /* End XCBuildConfiguration section */
400 | 
401 | /* Begin XCConfigurationList section */
402 | 		54B223DD29C5CF9F006F4683 /* Build configuration list for PBXProject "AlpacaChatApp" */ = {
403 | 			isa = XCConfigurationList;
404 | 			buildConfigurations = (
405 | 				54B223EE29C5CF9F006F4683 /* Debug */,
406 | 				54B223EF29C5CF9F006F4683 /* Release */,
407 | 			);
408 | 			defaultConfigurationIsVisible = 0;
409 | 			defaultConfigurationName = Release;
410 | 		};
411 | 		54B223F029C5CF9F006F4683 /* Build configuration list for PBXNativeTarget "AlpacaChatApp" */ = {
412 | 			isa = XCConfigurationList;
413 | 			buildConfigurations = (
414 | 				54B223F129C5CF9F006F4683 /* Debug */,
415 | 				54B223F229C5CF9F006F4683 /* Release */,
416 | 			);
417 | 			defaultConfigurationIsVisible = 0;
418 | 			defaultConfigurationName = Release;
419 | 		};
420 | /* End XCConfigurationList section */
421 | 
422 | /* Begin XCSwiftPackageProductDependency section */
423 | 		54B223FB29C5D075006F4683 /* AlpacaChat */ = {
424 | 			isa = XCSwiftPackageProductDependency;
425 | 			productName = AlpacaChat;
426 | 		};
427 | /* End XCSwiftPackageProductDependency section */
428 | 	};
429 | 	rootObject = 54B223DA29C5CF9F006F4683 /* Project object */;
430 | }
431 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/utils.cpp:
--------------------------------------------------------------------------------
  1 | #include "utils.h"
  2 | 
  3 | #include <cassert>
  4 | #include <cstring>
  5 | #include <fstream>
  6 | #include <regex>
  7 | #include <iostream>
  8 | #include <iterator>
  9 | #include <string>
 10 | #include <math.h>
 11 | 
 12 |  #if defined(_MSC_VER) || defined(__MINGW32__)
 13 |  #include <malloc.h> // using malloc.h with MSC/MINGW
 14 |  #elif !defined(__FreeBSD__) && !defined(__NetBSD__)
 15 |  #include <alloca.h>
 16 |  #endif
 17 | 
 18 | bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 19 |     for (int i = 1; i < argc; i++) {
 20 |         std::string arg = argv[i];
 21 | 
 22 |         if (arg == "-s" || arg == "--seed") {
 23 |             params.seed = std::stoi(argv[++i]);
 24 |         } else if (arg == "-t" || arg == "--threads") {
 25 |             params.n_threads = std::stoi(argv[++i]);
 26 |         } else if (arg == "-p" || arg == "--prompt") {
 27 |             params.prompt = argv[++i];
 28 |         } else if (arg == "-f" || arg == "--file") {
 29 | 
 30 |             std::ifstream file(argv[++i]);
 31 | 
 32 |             std::copy(std::istreambuf_iterator<char>(file),
 33 |                     std::istreambuf_iterator<char>(),
 34 |                     back_inserter(params.prompt));
 35 |                 
 36 |         } else if (arg == "-n" || arg == "--n_predict") {
 37 |             params.n_predict = std::stoi(argv[++i]);
 38 |         } else if (arg == "--top_k") {
 39 |             params.top_k = std::stoi(argv[++i]);
 40 |         } else if (arg == "-c" || arg == "--ctx_size") {
 41 |             params.n_ctx = std::stoi(argv[++i]);
 42 |         } else if (arg == "--top_p") {
 43 |             params.top_p = std::stof(argv[++i]);
 44 |         } else if (arg == "--temp") {
 45 |             params.temp = std::stof(argv[++i]);
 46 |         } else if (arg == "--repeat_last_n") {
 47 |             params.repeat_last_n = std::stoi(argv[++i]);
 48 |         } else if (arg == "--repeat_penalty") {
 49 |             params.repeat_penalty = std::stof(argv[++i]);
 50 |         } else if (arg == "-b" || arg == "--batch_size") {
 51 |             params.n_batch = std::stoi(argv[++i]);
 52 |         } else if (arg == "-m" || arg == "--model") {
 53 |             params.model = argv[++i];
 54 |         } else if (arg == "-i" || arg == "--interactive") {
 55 |             params.interactive = true;
 56 |         } else if (arg == "--interactive-start") {
 57 |             params.interactive = true;
 58 |             params.interactive_start = true;
 59 |         } else if (arg == "--color") {
 60 |             params.use_color = true;
 61 |         } else if (arg == "-r" || arg == "--reverse-prompt") {
 62 |             params.antiprompt = argv[++i];
 63 |         } else if (arg == "-h" || arg == "--help") {
 64 |             gpt_print_usage(argc, argv, params);
 65 |             exit(0);
 66 |         } else {
 67 |             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
 68 |             gpt_print_usage(argc, argv, params);
 69 |             exit(0);
 70 |         }
 71 |     }
 72 | 
 73 |     return true;
 74 | }
 75 | 
 76 | void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
 77 |     fprintf(stderr, "usage: %s [options]\n", argv[0]);
 78 |     fprintf(stderr, "\n");
 79 |     fprintf(stderr, "options:\n");
 80 |     fprintf(stderr, "  -h, --help            show this help message and exit\n");
 81 |     fprintf(stderr, "  -i, --interactive     run in interactive mode\n");
 82 |     fprintf(stderr, "  --interactive-start   run in interactive mode and poll user input at startup\n");
 83 |     fprintf(stderr, "  -r PROMPT, --reverse-prompt PROMPT\n");
 84 |     fprintf(stderr, "                        in interactive mode, poll user input upon seeing PROMPT\n");
 85 |     fprintf(stderr, "  --color               colorise output to distinguish prompt and user input from generations\n");
 86 |     fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
 87 |     fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
 88 |     fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
 89 |     fprintf(stderr, "                        prompt to start generation with (default: random)\n");
 90 |     fprintf(stderr, "  -f FNAME, --file FNAME\n");
 91 |     fprintf(stderr, "                        prompt file to start generation.\n");
 92 |     fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
 93 |     fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
 94 |     fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
 95 |     fprintf(stderr, "  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n);
 96 |     fprintf(stderr, "  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty);
 97 |     fprintf(stderr, "  -c N, --ctx_size N    size of the prompt context (default: %d)\n", params.n_ctx);
 98 |     fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
 99 |     fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
100 |     fprintf(stderr, "  -m FNAME, --model FNAME\n");
101 |     fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
102 |     fprintf(stderr, "\n");
103 | }
104 | 
105 | std::string gpt_random_prompt(std::mt19937 & rng) {
106 |     const int r = rng() % 10;
107 |     switch (r) {
108 |         case 0: return "So";
109 |         case 1: return "Once upon a time";
110 |         case 2: return "When";
111 |         case 3: return "The";
112 |         case 4: return "After";
113 |         case 5: return "If";
114 |         case 6: return "import";
115 |         case 7: return "He";
116 |         case 8: return "She";
117 |         case 9: return "They";
118 |         default: return "To";
119 |     }
120 | 
121 |     return "The";
122 | }
123 | 
124 | void replace(std::string & str, const std::string & needle, const std::string & replacement) {
125 |     size_t pos = 0;
126 |     while ((pos = str.find(needle, pos)) != std::string::npos) {
127 |         str.replace(pos, needle.length(), replacement);
128 |         pos += replacement.length();
129 |     }
130 | }
131 | 
132 | std::map<std::string, int32_t> json_parse(const std::string & fname) {
133 |     std::map<std::string, int32_t> result;
134 | 
135 |     // read file into string
136 |     std::string json;
137 |     {
138 |         std::ifstream ifs(fname);
139 |         if (!ifs) {
140 |             fprintf(stderr, "Failed to open %s\n", fname.c_str());
141 |             exit(1);
142 |         }
143 | 
144 |         json = std::string((std::istreambuf_iterator<char>(ifs)),
145 |                 (std::istreambuf_iterator<char>()));
146 |     }
147 | 
148 |     if (json[0] != '{') {
149 |         return result;
150 |     }
151 | 
152 |     // parse json
153 |     {
154 |         bool has_key  = false;
155 |         bool in_token = false;
156 | 
157 |         std::string str_key = "";
158 |         std::string str_val = "";
159 | 
160 |         int n = json.size();
161 |         for (int i = 1; i < n; ++i) {
162 |             if (!in_token) {
163 |                 if (json[i] == ' ') continue;
164 |                 if (json[i] == '"') {
165 |                     in_token = true;
166 |                     continue;
167 |                 }
168 |             } else {
169 |                 if (json[i] == '\\' && i+1 < n) {
170 |                     if (has_key == false) {
171 |                         str_key += json[i];
172 |                     } else {
173 |                         str_val += json[i];
174 |                     }
175 |                     ++i;
176 |                 } else if (json[i] == '"') {
177 |                     if (has_key == false) {
178 |                         has_key = true;
179 |                         ++i;
180 |                         while (json[i] == ' ') ++i;
181 |                         ++i; // :
182 |                         while (json[i] == ' ') ++i;
183 |                         if (json[i] != '\"') {
184 |                             while (json[i] != ',' && json[i] != '}') {
185 |                                 str_val += json[i++];
186 |                             }
187 |                             has_key = false;
188 |                         } else {
189 |                             in_token = true;
190 |                             continue;
191 |                         }
192 |                     } else {
193 |                         has_key = false;
194 |                     }
195 | 
196 |                     ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
197 |                     ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
198 |                     ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
199 | 
200 |                     try {
201 |                         result[str_key] = std::stoi(str_val);
202 |                     } catch (...) {
203 |                         //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
204 | 
205 |                     }
206 |                     str_key = "";
207 |                     str_val = "";
208 |                     in_token = false;
209 |                     continue;
210 |                 }
211 |                 if (has_key == false) {
212 |                     str_key += json[i];
213 |                 } else {
214 |                     str_val += json[i];
215 |                 }
216 |             }
217 |         }
218 |     }
219 | 
220 |     return result;
221 | }
222 | 
223 | std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
224 |     std::vector<std::string> words;
225 | 
226 |     // first split the text into words
227 |     {
228 |         std::string str = text;
229 |         std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
230 | 
231 |         std::regex re(pat);
232 |         std::smatch m;
233 | 
234 |         while (std::regex_search(str, m, re)) {
235 |             for (auto x : m) {
236 |                 words.push_back(x);
237 |             }
238 |             str = m.suffix();
239 |         }
240 |     }
241 | 
242 |     // find the longest tokens that form the words:
243 |     std::vector<gpt_vocab::id> tokens;
244 |     for (const auto & word : words) {
245 |         if (word.size() == 0) continue;
246 | 
247 |         int i = 0;
248 |         int n = word.size();
249 |         while (i < n) {
250 |             int j = n;
251 |             while (j > i) {
252 |                 auto it = vocab.token_to_id.find(word.substr(i, j-i));
253 |                 if (it != vocab.token_to_id.end()) {
254 |                     tokens.push_back(it->second);
255 |                     i = j;
256 |                     break;
257 |                 }
258 |                 --j;
259 |             }
260 |             if (i == n) {
261 |                 break;
262 |             }
263 |             if (j == i) {
264 |                 auto sub = word.substr(i, 1);
265 |                 if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
266 |                     tokens.push_back(vocab.token_to_id.at(sub));
267 |                 } else {
268 |                     fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
269 |                 }
270 |                 ++i;
271 |             }
272 |         }
273 |     }
274 | 
275 |     return tokens;
276 | }
277 | 
278 | // TODO: Calculate this constant from the vocabulary
279 | #define MAX_TOKEN_LEN 18
280 | // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
281 | std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
282 |     std::vector<gpt_vocab::id> res;
283 |     std::vector<int> score;
284 |     std::vector<gpt_vocab::id> prev;
285 |     int len = text.length();
286 | 
287 |     score.resize(len + 1);
288 |     prev.resize(len + 1);
289 | 
290 |     // Forward pass
291 |     for (int i = 0; i < len; i++) {
292 |         int max_len = std::min(len - i, MAX_TOKEN_LEN);
293 |         for (int sub_len = 1; sub_len <= len - i; sub_len++) {
294 |             auto sub = text.substr(i, sub_len);
295 |             auto token = vocab.token_to_id.find(sub);
296 |             if (token != vocab.token_to_id.end()) {
297 |                 int token_score = sub.length() * sub.length();
298 |                 int local_score = score[i] + token_score;
299 |                 int next = i + sub_len;
300 |                 if (score[next] < local_score) {
301 |                     score[next] = local_score;
302 |                     prev[next] = (*token).second;
303 |                 }
304 |             }
305 |         }
306 |     }
307 | 
308 |     // Backward pass
309 |     int i = len;
310 |     while (i > 0) {
311 |         gpt_vocab::id token_id = prev[i];
312 |         if (token_id == 0) {
313 | 	    // TODO: Return error or something more meaningful
314 |             printf("failed to tokenize string!\n");
315 | 	    break;
316 |         }
317 |         res.push_back(token_id);
318 |         auto token = (*vocab.id_to_token.find(token_id)).second;
319 |         i -= token.length();
320 |     }
321 | 
322 |     if (bos) {
323 |         res.push_back(1); // TODO: replace with vocab.bos
324 |     }
325 | 
326 |     // Pieces are in reverse order so correct that
327 |     std::reverse(res.begin(), res.end());
328 | 
329 |     return res;
330 | }
331 | 
332 | bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
333 |     printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
334 | 
335 |     vocab.token_to_id = ::json_parse(fname);
336 | 
337 |     for (const auto & kv : vocab.token_to_id) {
338 |         vocab.id_to_token[kv.second] = kv.first;
339 |     }
340 | 
341 |     printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
342 | 
343 |     // print the vocabulary
344 |     //for (auto kv : vocab.token_to_id) {
345 |     //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
346 |     //}
347 | 
348 |     return true;
349 | }
350 | 
351 | 
352 | void sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {
353 |     // find the top K tokens
354 |     std::partial_sort(
355 |             logits_id.begin(),
356 |             logits_id.begin() + top_k, logits_id.end(),
357 |             [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
358 |         return a.first > b.first;
359 |     });
360 | 
361 |     logits_id.resize(top_k);
362 | }
363 | 
364 | gpt_vocab::id llama_sample_top_p_top_k(
365 |         const gpt_vocab & vocab,
366 |         const float * logits,
367 |         std::vector<gpt_vocab::id> & last_n_tokens,
368 |         double repeat_penalty,
369 |         int top_k,
370 |         double top_p,
371 |         double temp,
372 |         std::mt19937 & rng) {
373 |     int n_logits = vocab.id_to_token.size();
374 | 
375 |     std::vector<std::pair<double, gpt_vocab::id>> logits_id;
376 |     logits_id.reserve(n_logits);
377 | 
378 |     {
379 |         const double scale = 1.0/temp;
380 |         for (int i = 0; i < n_logits; ++i) {
381 |             // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
382 |             // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
383 |             if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
384 |                 // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
385 |                 if (logits[i] < 0.0) {
386 |                     logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));
387 |                 } else {
388 |                     logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));
389 |                 }                
390 |             } else {
391 |                 logits_id.push_back(std::make_pair(logits[i]*scale, i));
392 |             }
393 |         }
394 |     }
395 | 
396 |     sample_top_k(logits_id, top_k);
397 | 
398 |     double maxl = -INFINITY;
399 |     for (const auto & kv : logits_id) {
400 |         maxl = std::max(maxl, kv.first);
401 |     }
402 | 
403 |     // compute probs for the top K tokens
404 |     std::vector<double> probs;
405 |     probs.reserve(logits_id.size());
406 | 
407 |     double sum = 0.0;
408 |     for (const auto & kv : logits_id) {
409 |         double p = exp(kv.first - maxl);
410 |         probs.push_back(p);
411 |         sum += p;
412 |     }
413 | 
414 |     // normalize the probs
415 |     for (auto & p : probs) {
416 |         p /= sum;
417 |     }
418 | 
419 |     if (top_p < 1.0f) {
420 |         double cumsum = 0.0f;
421 |         for (int i = 0; i < (int) probs.size(); i++) {
422 |             cumsum += probs[i];
423 |             if (cumsum >= top_p) {
424 |                 probs.resize(i + 1);
425 |                 logits_id.resize(i + 1);
426 |                 break;
427 |             }
428 |         }
429 | 
430 |         cumsum = 1.0/cumsum;
431 |         for (int i = 0; i < (int) probs.size(); i++) {
432 |             probs[i] *= cumsum;
433 |         }
434 |     }
435 | 
436 |     //printf("\n");
437 |     //for (int i = 0; i < (int) 10; i++) {
438 |     //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
439 |     //}
440 |     //printf("\n\n");
441 |     //exit(0);
442 | 
443 |     std::discrete_distribution<> dist(probs.begin(), probs.end());
444 |     int idx = dist(rng);
445 | 
446 |     return logits_id[idx].second;
447 | }
448 | 
449 | 
450 | size_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
451 |     const int nb = k / qk;
452 |     const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);
453 |     const size_t row_size = nb*bs;
454 | 
455 |     assert(k % qk == 0);
456 | 
457 |     const size_t pp_size = qk / 2;
458 |     uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
459 | 
460 |     char * pdst = (char *) dst;
461 | 
462 |     for (int j = 0; j < n; j += k) {
463 |         uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);
464 |         uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));
465 | 
466 |         for (int i = 0; i < nb; i++) {
467 |             float amax = 0.0f; // absolute max
468 | 
469 |             {
470 |                 for (int l = 0; l < qk; l++) {
471 |                     const float v = src[j + i*qk + l];
472 |                     amax = std::max(amax, fabsf(v));
473 |                 }
474 | 
475 |                 const float d = amax / ((1 << 3) - 1);
476 |                 const float id = d ? 1.0f/d : 0.0f;
477 | 
478 |                 *(float *) pd = d;
479 |                 pd += bs;
480 | 
481 |                 for (int l = 0; l < qk; l += 2) {
482 |                     const float v0 = (src[j + i*qk + l + 0])*id;
483 |                     const float v1 = (src[j + i*qk + l + 1])*id;
484 | 
485 |                     const uint8_t vi0 = ((int8_t) (round(v0))) + 8;
486 |                     const uint8_t vi1 = ((int8_t) (round(v1))) + 8;
487 | 
488 |                     assert(vi0 >= 0 && vi0 < 16);
489 |                     assert(vi1 >= 0 && vi1 < 16);
490 | 
491 |                     hist[vi0]++;
492 |                     hist[vi1]++;
493 | 
494 |                     pp[l/2] = vi0 | (vi1 << 4);
495 |                 }
496 | 
497 |                 memcpy(pb, pp, pp_size);
498 |                 pb += bs;
499 |             }
500 |         }
501 |     }
502 | 
503 |     return (n/k)*row_size;
504 | }
505 | 
506 | size_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {
507 |     const int nb = k / qk;
508 |     const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);
509 | 
510 |     assert(k % qk == 0);
511 | 
512 |     const size_t pp_size = qk / 2;
513 |     uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));
514 | 
515 |     char * pdst = (char *) dst;
516 | 
517 |     for (int j = 0; j < n; j += k) {
518 |         float   * pm = (float *)   (pdst + (j/k)*row_size);
519 |         float   * pd = (float *)   (pm + nb);
520 |         uint8_t * pb = (uint8_t *) (pd + nb);
521 | 
522 |         //printf("n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\n", n, k, nb, row_size, j, pm, pd, pb);
523 | 
524 |         for (int i = 0; i < nb; i++) {
525 |             float min = std::numeric_limits<float>::max();
526 |             float max = std::numeric_limits<float>::min();
527 | 
528 |             {
529 |                 for (int l = 0; l < qk; l++) {
530 |                     const float v = src[j + i*qk + l];
531 |                     if (v < min) min = v;
532 |                     if (v > max) max = v;
533 |                 }
534 | 
535 |                 const float d = (max - min) / ((1 << 4) - 1);
536 |                 const float id = d ? 1.0f/d : 0.0f;
537 | 
538 |                 pm[i] = min;
539 |                 pd[i] = d;
540 | 
541 |                 for (int l = 0; l < qk; l += 2) {
542 |                     const float v0 = (src[j + i*qk + l + 0] - min)*id;
543 |                     const float v1 = (src[j + i*qk + l + 1] - min)*id;
544 | 
545 |                     const uint8_t vi0 = round(v0);
546 |                     const uint8_t vi1 = round(v1);
547 | 
548 |                     assert(vi0 >= 0 && vi0 < 16);
549 |                     assert(vi1 >= 0 && vi1 < 16);
550 | 
551 |                     hist[vi0]++;
552 |                     hist[vi1]++;
553 | 
554 |                     pp[l/2] = vi0 | (vi1 << 4);
555 |                 }
556 | 
557 |                 memcpy(pb + i*qk/2, pp, pp_size);
558 |             }
559 |         }
560 |     }
561 | 
562 |     return (n/k)*row_size;
563 | }
564 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/include/ggml.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | //
  4 | // GGML Tensor Library
  5 | //
  6 | // This documentation is still a work in progress.
  7 | // If you wish some specific topics to be covered, feel free to drop a comment:
  8 | //
  9 | //   https://github.com/ggerganov/whisper.cpp/issues/40
 10 | //
 11 | // ## Overview
 12 | //
 13 | // This library implements:
 14 | //
 15 | //  - a set of tensor operations
 16 | //  - automatic differentiation
 17 | //  - basic optimization algorithms
 18 | //
 19 | // The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
 20 | // but is not limited to, the following:
 21 | //
 22 | //  - linear regression
 23 | //  - support vector machines
 24 | //  - neural networks
 25 | //
 26 | // The library allows the user to define a certain function using the available tensor operations. This function
 27 | // definition is represented internally via a computation graph. Each tensor operation in the function definition
 28 | // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
 29 | // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
 30 | // using one of the available optimization algorithms.
 31 | //
 32 | // For example, here we define the function: f(x) = a*x^2 + b
 33 | //
 34 | //   {
 35 | //       struct ggml_init_params params = {
 36 | //           .mem_size   = 16*1024*1024,
 37 | //           .mem_buffer = NULL,
 38 | //       };
 39 | //
 40 | //       // memory allocation happens here
 41 | //       struct ggml_context * ctx = ggml_init(params);
 42 | //
 43 | //       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 44 | //
 45 | //       ggml_set_param(ctx, x); // x is an input variable
 46 | //
 47 | //       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 48 | //       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
 49 | //       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
 50 | //       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
 51 | //
 52 | //       ...
 53 | //   }
 54 | //
 55 | // Notice that the function definition above does not involve any actual computation. The computation is performed only
 56 | // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
 57 | //
 58 | //   {
 59 | //       ...
 60 | //
 61 | //       struct ggml_cgraph gf = ggml_build_forward(f);
 62 | //
 63 | //       // set the input variable and parameter values
 64 | //       ggml_set_f32(x, 2.0f);
 65 | //       ggml_set_f32(a, 3.0f);
 66 | //       ggml_set_f32(b, 4.0f);
 67 | //
 68 | //       ggml_graph_compute(ctx0, &gf);
 69 | //
 70 | //       printf("f = %f\n", ggml_get_f32_1d(f, 0));
 71 | //
 72 | //       ...
 73 | //   }
 74 | //
 75 | // The actual computation is performed in the ggml_graph_compute() function.
 76 | //
 77 | // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
 78 | // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
 79 | // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
 80 | // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
 81 | // actually needed.
 82 | //
 83 | // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
 84 | // differentiation and optimization algorithms.
 85 | //
 86 | // The described approach allows to define the function graph once and then compute its forward or backward graphs
 87 | // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
 88 | // the user can avoid the memory allocation overhead at runtime.
 89 | //
 90 | // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
 91 | // citizens, but in theory the library can be extended to support FP8 and integer data types.
 92 | //
 93 | // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
 94 | // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
 95 | // clear that the library needs to support more complex operations. The way to support these operations is not clear
 96 | // yet, but a few examples are demonstrated in the following operations:
 97 | //
 98 | //   - ggml_permute()
 99 | //   - ggml_conv_1d_1s()
100 | //   - ggml_conv_1d_2s()
101 | //
102 | // For each tensor operator, the library implements a forward and backward computation function. The forward function
103 | // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
104 | // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
105 | // calculus class, or watch the following video:
106 | //
107 | //   What is Automatic Differentiation?
108 | //   https://www.youtube.com/watch?v=wG_nF1awSSY
109 | //
110 | //
111 | // ## Tensor data (struct ggml_tensor)
112 | //
113 | // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
114 | // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
115 | // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
116 | //
117 | //   {
118 | //       struct ggml_tensor * c = ggml_add(ctx, a, b);
119 | //
120 | //       assert(c->src[0] == a);
121 | //       assert(c->src[1] == b);
122 | //   }
123 | //
124 | // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
125 | // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
126 | // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
127 | // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
128 | // contiguous in memory.
129 | //
130 | // The data of the tensor is accessed via the "data" pointer. For example:
131 | //
132 | //   {
133 | //       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
134 | //
135 | //       // a[1, 2] = 1.0f;
136 | //       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
137 | //
138 | //       // a[2, 0] = 2.0f;
139 | //       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
140 | //
141 | //       ...
142 | //   }
143 | //
144 | // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
145 | //
146 | // ## The matrix multiplication operator (ggml_mul_mat)
147 | //
148 | // TODO
149 | //
150 | //
151 | // ## Multi-threading
152 | //
153 | // TODO
154 | //
155 | //
156 | // ## Overview of ggml.c
157 | //
158 | // TODO
159 | //
160 | //
161 | // ## SIMD optimizations
162 | //
163 | // TODO
164 | //
165 | //
166 | // ## Debugging ggml
167 | //
168 | // TODO
169 | //
170 | //
171 | 
172 | #ifdef  __cplusplus
173 | extern "C" {
174 | #endif
175 | 
176 | #include <stdint.h>
177 | #include <stddef.h>
178 | #include <stdbool.h>
179 | 
180 | #define GGML_MAX_DIMS     4
181 | #define GGML_MAX_NODES    4096
182 | #define GGML_MAX_PARAMS   16
183 | #define GGML_MAX_CONTEXTS 64
184 | #define GGML_MAX_OPT      4
185 | 
186 | #ifdef __ARM_NEON
187 | // we use the built-in 16-bit float type
188 | typedef __fp16 ggml_fp16_t;
189 | #else
190 | typedef uint16_t ggml_fp16_t;
191 | #endif
192 | 
193 | // convert FP16 <-> FP32
194 | float       ggml_fp16_to_fp32(ggml_fp16_t x);
195 | ggml_fp16_t ggml_fp32_to_fp16(float x);
196 | 
197 | struct ggml_object;
198 | struct ggml_context;
199 | 
200 | enum ggml_type {
201 |     GGML_TYPE_Q4_0,
202 |     GGML_TYPE_Q4_1,
203 |     GGML_TYPE_I8,
204 |     GGML_TYPE_I16,
205 |     GGML_TYPE_I32,
206 |     GGML_TYPE_F16,
207 |     GGML_TYPE_F32,
208 |     GGML_TYPE_COUNT,
209 | };
210 | 
211 | // available tensor operations:
212 | enum ggml_op {
213 |     GGML_OP_NONE = 0,
214 | 
215 |     GGML_OP_DUP,
216 |     GGML_OP_ADD,
217 |     GGML_OP_SUB,
218 |     GGML_OP_MUL,
219 |     GGML_OP_DIV,
220 |     GGML_OP_SQR,
221 |     GGML_OP_SQRT,
222 |     GGML_OP_SUM,
223 |     GGML_OP_MEAN,
224 |     GGML_OP_REPEAT,
225 |     GGML_OP_ABS,
226 |     GGML_OP_SGN,
227 |     GGML_OP_NEG,
228 |     GGML_OP_STEP,
229 |     GGML_OP_RELU,
230 |     GGML_OP_GELU,
231 |     GGML_OP_SILU,
232 |     GGML_OP_NORM, // normalize
233 |     GGML_OP_RMS_NORM,
234 | 
235 |     GGML_OP_MUL_MAT,
236 | 
237 |     GGML_OP_SCALE,
238 |     GGML_OP_CPY,
239 |     GGML_OP_RESHAPE,
240 |     GGML_OP_VIEW,
241 |     GGML_OP_PERMUTE,
242 |     GGML_OP_TRANSPOSE,
243 |     GGML_OP_GET_ROWS,
244 |     GGML_OP_DIAG_MASK_INF,
245 |     GGML_OP_SOFT_MAX,
246 |     GGML_OP_ROPE,
247 |     GGML_OP_CONV_1D_1S,
248 |     GGML_OP_CONV_1D_2S,
249 | 
250 |     GGML_OP_FLASH_ATTN,
251 |     GGML_OP_FLASH_FF,
252 | 
253 |     GGML_OP_COUNT,
254 | };
255 | 
256 | // n-dimensional tensor
257 | struct ggml_tensor {
258 |     enum ggml_type type;
259 | 
260 |     int    n_dims;
261 |     int    ne[GGML_MAX_DIMS]; // number of elements
262 |     size_t nb[GGML_MAX_DIMS]; // stride in bytes:
263 |                               // nb[0] = sizeof(type)
264 |                               // nb[1] = nb[0]   * ne[0] + padding
265 |                               // nb[i] = nb[i-1] * ne[i-1]
266 | 
267 |     // compute data
268 |     enum ggml_op op;
269 | 
270 |     bool is_param;
271 | 
272 |     struct ggml_tensor * grad;
273 |     struct ggml_tensor * src0;
274 |     struct ggml_tensor * src1;
275 |     struct ggml_tensor * opt[GGML_MAX_OPT];
276 | 
277 |     // thread scheduling
278 |     int n_tasks;
279 | 
280 |     // performance
281 |     int     perf_runs;
282 |     int64_t perf_cycles;
283 |     int64_t perf_time_us;
284 | 
285 |     void * data;
286 |     char padding[8];
287 | };
288 | 
289 | // computation graph
290 | struct ggml_cgraph {
291 |     int n_nodes;
292 |     int n_leafs;
293 |     int n_threads;
294 | 
295 |     size_t work_size;
296 |     struct ggml_tensor * work;
297 | 
298 |     struct ggml_tensor * nodes[GGML_MAX_NODES];
299 |     struct ggml_tensor * grads[GGML_MAX_NODES];
300 |     struct ggml_tensor * leafs[GGML_MAX_NODES];
301 | 
302 |     // performance
303 |     int     perf_runs;
304 |     int64_t perf_cycles;
305 |     int64_t perf_time_us;
306 | };
307 | 
308 | // scratch buffer
309 | struct ggml_scratch {
310 |     size_t offs;
311 |     size_t size;
312 |     void * data;
313 | };
314 | 
315 | struct ggml_init_params {
316 |     // memory pool
317 |     size_t mem_size;   // bytes
318 |     void * mem_buffer; // if NULL, memory will be allocated internally
319 | };
320 | 
321 | void    ggml_time_init(void); // call this once at the beginning of the program
322 | int64_t ggml_time_ms(void);
323 | int64_t ggml_time_us(void);
324 | int64_t ggml_cycles(void);
325 | int64_t ggml_cycles_per_ms(void);
326 | 
327 | void ggml_print_object (const struct ggml_object * obj);
328 | void ggml_print_objects(const struct ggml_context * ctx);
329 | 
330 | int    ggml_nelements(const struct ggml_tensor * tensor);
331 | size_t ggml_nbytes   (const struct ggml_tensor * tensor);
332 | 
333 | int    ggml_blck_size (enum ggml_type type);
334 | size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
335 | float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
336 | 
337 | size_t ggml_element_size(const struct ggml_tensor * tensor);
338 | 
339 | struct ggml_context * ggml_init(struct ggml_init_params params);
340 | void ggml_free(struct ggml_context * ctx);
341 | 
342 | size_t ggml_used_mem(const struct ggml_context * ctx);
343 | 
344 | size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
345 | 
346 | struct ggml_tensor * ggml_new_tensor(
347 |         struct ggml_context * ctx,
348 |         enum   ggml_type type,
349 |         int    n_dims,
350 |         const int *ne);
351 | 
352 | struct ggml_tensor * ggml_new_tensor_1d(
353 |         struct ggml_context * ctx,
354 |         enum   ggml_type type,
355 |         int    ne0);
356 | 
357 | 
358 | struct ggml_tensor * ggml_new_tensor_1d_dummy(
359 |         struct ggml_context * ctx,
360 |         enum   ggml_type type,
361 |         int    ne0);
362 | 
363 | 
364 | struct ggml_tensor * ggml_new_tensor_2d_dummy(
365 |         struct ggml_context * ctx,
366 |         enum   ggml_type type,
367 |         int    ne0,
368 |         int    ne1);
369 | 
370 | 
371 | struct ggml_tensor * ggml_new_tensor_2d(
372 |         struct ggml_context * ctx,
373 |         enum   ggml_type type,
374 |         int    ne0,
375 |         int    ne1);
376 | 
377 | struct ggml_tensor * ggml_new_tensor_3d(
378 |         struct ggml_context * ctx,
379 |         enum   ggml_type type,
380 |         int    ne0,
381 |         int    ne1,
382 |         int    ne2);
383 | 
384 | struct ggml_tensor * ggml_new_tensor_4d(
385 |         struct ggml_context * ctx,
386 |         enum   ggml_type type,
387 |         int    ne0,
388 |         int    ne1,
389 |         int    ne2,
390 |         int    ne3);
391 | 
392 | struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
393 | struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
394 | 
395 | struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
396 | struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
397 | 
398 | struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
399 | struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
400 | struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
401 | 
402 | int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
403 | void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
404 | 
405 | float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
406 | void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
407 | 
408 |  void * ggml_get_data    (const struct ggml_tensor * tensor);
409 | float * ggml_get_data_f32(const struct ggml_tensor * tensor);
410 | 
411 | //
412 | // operations on tensors with backpropagation
413 | //
414 | 
415 | struct ggml_tensor * ggml_dup(
416 |         struct ggml_context * ctx,
417 |         struct ggml_tensor  * a);
418 | 
419 | struct ggml_tensor * ggml_add(
420 |         struct ggml_context * ctx,
421 |         struct ggml_tensor  * a,
422 |         struct ggml_tensor  * b);
423 | 
424 | struct ggml_tensor * ggml_sub(
425 |         struct ggml_context * ctx,
426 |         struct ggml_tensor  * a,
427 |         struct ggml_tensor  * b);
428 | 
429 | struct ggml_tensor * ggml_mul(
430 |         struct ggml_context * ctx,
431 |         struct ggml_tensor  * a,
432 |         struct ggml_tensor  * b);
433 | 
434 | struct ggml_tensor * ggml_div(
435 |         struct ggml_context * ctx,
436 |         struct ggml_tensor  * a,
437 |         struct ggml_tensor  * b);
438 | 
439 | struct ggml_tensor * ggml_sqr(
440 |         struct ggml_context * ctx,
441 |         struct ggml_tensor  * a);
442 | 
443 | struct ggml_tensor * ggml_sqrt(
444 |         struct ggml_context * ctx,
445 |         struct ggml_tensor  * a);
446 | 
447 | // return scalar
448 | // TODO: compute sum along rows
449 | struct ggml_tensor * ggml_sum(
450 |         struct ggml_context * ctx,
451 |         struct ggml_tensor  * a);
452 | 
453 | // mean along rows
454 | struct ggml_tensor * ggml_mean(
455 |         struct ggml_context * ctx,
456 |         struct ggml_tensor  * a);
457 | 
458 | // if a is the same shape as b, and a is not parameter, return a
459 | // otherwise, return a new tensor: repeat(a) to fit in b
460 | struct ggml_tensor * ggml_repeat(
461 |         struct ggml_context * ctx,
462 |         struct ggml_tensor  * a,
463 |         struct ggml_tensor  * b);
464 | 
465 | struct ggml_tensor * ggml_abs(
466 |         struct ggml_context * ctx,
467 |         struct ggml_tensor  * a);
468 | 
469 | struct ggml_tensor * ggml_sgn(
470 |         struct ggml_context * ctx,
471 |         struct ggml_tensor  * a);
472 | 
473 | struct ggml_tensor * ggml_neg(
474 |         struct ggml_context * ctx,
475 |         struct ggml_tensor  * a);
476 | 
477 | struct ggml_tensor * ggml_step(
478 |         struct ggml_context * ctx,
479 |         struct ggml_tensor  * a);
480 | 
481 | struct ggml_tensor * ggml_relu(
482 |         struct ggml_context * ctx,
483 |         struct ggml_tensor  * a);
484 | 
485 | // TODO: double-check this computation is correct
486 | struct ggml_tensor * ggml_gelu(
487 |         struct ggml_context * ctx,
488 |         struct ggml_tensor  * a);
489 | 
490 | struct ggml_tensor * ggml_silu(
491 |         struct ggml_context * ctx,
492 |         struct ggml_tensor  * a);
493 | 
494 | // normalize along rows
495 | // TODO: eps is hardcoded to 1e-5 for now
496 | struct ggml_tensor * ggml_norm(
497 |         struct ggml_context * ctx,
498 |         struct ggml_tensor  * a);
499 | 
500 | struct ggml_tensor * ggml_rms_norm(
501 |         struct ggml_context * ctx,
502 |         struct ggml_tensor  * a);
503 | 
504 | // A: m rows, n columns
505 | // B: p rows, n columns (i.e. we transpose it internally)
506 | // result is m columns, p rows
507 | struct ggml_tensor * ggml_mul_mat(
508 |         struct ggml_context * ctx,
509 |         struct ggml_tensor  * a,
510 |         struct ggml_tensor  * b);
511 | 
512 | //
513 | // operations on tensors without backpropagation
514 | //
515 | 
516 | // in-place, returns view(a)
517 | struct ggml_tensor * ggml_scale(
518 |         struct ggml_context * ctx,
519 |         struct ggml_tensor  * a,
520 |         struct ggml_tensor  * b);
521 | 
522 | // a -> b, return view(b)
523 | struct ggml_tensor * ggml_cpy(
524 |         struct ggml_context * ctx,
525 |         struct ggml_tensor  * a,
526 |         struct ggml_tensor  * b);
527 | 
528 | // return view(a), b specifies the new shape
529 | // TODO: when we start computing gradient, make a copy instead of view
530 | struct ggml_tensor * ggml_reshape(
531 |         struct ggml_context * ctx,
532 |         struct ggml_tensor  * a,
533 |         struct ggml_tensor  * b);
534 | 
535 | // return view(a)
536 | // TODO: when we start computing gradient, make a copy instead of view
537 | struct ggml_tensor * ggml_reshape_2d(
538 |         struct ggml_context * ctx,
539 |         struct ggml_tensor  * a,
540 |         int                   ne0,
541 |         int                   ne1);
542 | 
543 | // return view(a)
544 | // TODO: when we start computing gradient, make a copy instead of view
545 | struct ggml_tensor * ggml_reshape_3d(
546 |         struct ggml_context * ctx,
547 |         struct ggml_tensor  * a,
548 |         int                   ne0,
549 |         int                   ne1,
550 |         int                   ne2);
551 | 
552 | // offset in bytes
553 | struct ggml_tensor * ggml_view_1d(
554 |         struct ggml_context * ctx,
555 |         struct ggml_tensor  * a,
556 |         int                   ne0,
557 |         size_t                offset);
558 | 
559 | struct ggml_tensor * ggml_view_2d(
560 |         struct ggml_context * ctx,
561 |         struct ggml_tensor  * a,
562 |         int                   ne0,
563 |         int                   ne1,
564 |         size_t                nb1, // row stride in bytes
565 |         size_t                offset);
566 | 
567 | struct ggml_tensor * ggml_permute(
568 |         struct ggml_context * ctx,
569 |         struct ggml_tensor  * a,
570 |         int                   axis0,
571 |         int                   axis1,
572 |         int                   axis2,
573 |         int                   axis3);
574 | 
575 | // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
576 | struct ggml_tensor * ggml_transpose(
577 |         struct ggml_context * ctx,
578 |         struct ggml_tensor  * a);
579 | 
580 | struct ggml_tensor * ggml_get_rows(
581 |         struct ggml_context * ctx,
582 |         struct ggml_tensor  * a,
583 |         struct ggml_tensor  * b);
584 | 
585 | // set elements above the diagonal to -INF
586 | // in-place, returns view(a)
587 | struct ggml_tensor * ggml_diag_mask_inf(
588 |         struct ggml_context * ctx,
589 |         struct ggml_tensor  * a,
590 |         int                   n_past);
591 | 
592 | // in-place, returns view(a)
593 | struct ggml_tensor * ggml_soft_max(
594 |         struct ggml_context * ctx,
595 |         struct ggml_tensor  * a);
596 | 
597 | // rotary position embedding
598 | // in-place, returns view(a)
599 | // if mode == 1, skip n_past elements
600 | // TODO: avoid creating a new tensor every time
601 | struct ggml_tensor * ggml_rope(
602 |         struct ggml_context * ctx,
603 |         struct ggml_tensor  * a,
604 |         int                   n_past,
605 |         int                   n_dims,
606 |         int                   mode);
607 | 
608 | // padding = 1
609 | // TODO: we don't support extra parameters for now
610 | //       that's why we are hard-coding the stride, padding, and dilation
611 | //       not great ..
612 | struct ggml_tensor * ggml_conv_1d_1s(
613 |         struct ggml_context * ctx,
614 |         struct ggml_tensor  * a,
615 |         struct ggml_tensor  * b);
616 | 
617 | struct ggml_tensor * ggml_conv_1d_2s(
618 |         struct ggml_context * ctx,
619 |         struct ggml_tensor  * a,
620 |         struct ggml_tensor  * b);
621 | 
622 | struct ggml_tensor * ggml_flash_attn(
623 |         struct ggml_context * ctx,
624 |         struct ggml_tensor  * q,
625 |         struct ggml_tensor  * k,
626 |         struct ggml_tensor  * v,
627 |         bool                  masked);
628 | 
629 | struct ggml_tensor * ggml_flash_ff(
630 |         struct ggml_context * ctx,
631 |         struct ggml_tensor  * a,
632 |         struct ggml_tensor  * b0,
633 |         struct ggml_tensor  * b1,
634 |         struct ggml_tensor  * c0,
635 |         struct ggml_tensor  * c1);
636 | 
637 | //
638 | // automatic differentiation
639 | //
640 | 
641 | void ggml_set_param(
642 |         struct ggml_context * ctx,
643 |         struct ggml_tensor * tensor);
644 | 
645 | void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
646 | 
647 | struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
648 | struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
649 | 
650 | void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
651 | void ggml_graph_reset  (struct ggml_cgraph * cgraph);
652 | 
653 | // print info and performance information for the graph
654 | void ggml_graph_print(const struct ggml_cgraph * cgraph);
655 | 
656 | // dump the graph into a file using the dot format
657 | void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
658 | 
659 | //
660 | // optimization
661 | //
662 | 
663 | // optimization methods
664 | enum ggml_opt_type {
665 |     GGML_OPT_ADAM,
666 |     GGML_OPT_LBFGS,
667 | };
668 | 
669 | // linesearch methods
670 | enum ggml_linesearch {
671 |     GGML_LINESEARCH_DEFAULT = 1,
672 | 
673 |     GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
674 |     GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
675 |     GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
676 | };
677 | 
678 | // optimization return values
679 | enum ggml_opt_result {
680 |     GGML_OPT_OK = 0,
681 |     GGML_OPT_DID_NOT_CONVERGE,
682 |     GGML_OPT_NO_CONTEXT,
683 |     GGML_OPT_INVALID_WOLFE,
684 |     GGML_OPT_FAIL,
685 | 
686 |     GGML_LINESEARCH_FAIL = -128,
687 |     GGML_LINESEARCH_MINIMUM_STEP,
688 |     GGML_LINESEARCH_MAXIMUM_STEP,
689 |     GGML_LINESEARCH_MAXIMUM_ITERATIONS,
690 |     GGML_LINESEARCH_INVALID_PARAMETERS,
691 | };
692 | 
693 | // optimization parameters
694 | //
695 | //   see ggml.c (ggml_opt_default_params) for default values
696 | //
697 | struct ggml_opt_params {
698 |     enum ggml_opt_type type;
699 | 
700 |     int n_threads;
701 | 
702 |     // delta-based convergence test
703 |     //
704 |     //   if past == 0 - disabled
705 |     //   if past > 0:
706 |     //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
707 |     //
708 |     int past;
709 |     float delta;
710 | 
711 |     // maximum number of iterations without improvement
712 |     //
713 |     //   if 0 - disabled
714 |     //   if > 0:
715 |     //     assume convergence if no cost improvement in this number of iterations
716 |     //
717 |     int max_no_improvement;
718 | 
719 |     bool print_forward_graph;
720 |     bool print_backward_graph;
721 | 
722 |     // ADAM parameters
723 |     struct {
724 |         int n_iter;
725 | 
726 |         float alpha; // learning rate
727 |         float beta1;
728 |         float beta2;
729 |         float eps;   // epsilon for numerical stability
730 |         float eps_f; // epsilon for convergence test
731 |         float eps_g; // epsilon for convergence test
732 |     } adam;
733 | 
734 |     // LBFGS parameters
735 |     struct {
736 |         int m; // number of corrections to approximate the inv. Hessian
737 |         int n_iter;
738 |         int max_linesearch;
739 | 
740 |         float eps;      // convergence tolerance
741 |         float ftol;     // line search tolerance
742 |         float wolfe;
743 |         float min_step;
744 |         float max_step;
745 | 
746 |         enum ggml_linesearch linesearch;
747 |     } lbfgs;
748 | };
749 | 
750 | struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
751 | 
752 | // optimize the function defined by the tensor f
753 | enum ggml_opt_result ggml_opt(
754 |         struct ggml_context * ctx,
755 |         struct ggml_opt_params params,
756 |         struct ggml_tensor * f);
757 | 
758 | //
759 | // system info
760 | //
761 | 
762 | int ggml_cpu_has_avx(void);
763 | int ggml_cpu_has_avx2(void);
764 | int ggml_cpu_has_avx512(void);
765 | int ggml_cpu_has_fma(void);
766 | int ggml_cpu_has_neon(void);
767 | int ggml_cpu_has_arm_fma(void);
768 | int ggml_cpu_has_f16c(void);
769 | int ggml_cpu_has_fp16_va(void);
770 | int ggml_cpu_has_wasm_simd(void);
771 | int ggml_cpu_has_blas(void);
772 | int ggml_cpu_has_sse3(void);
773 | int ggml_cpu_has_vsx(void);
774 | 
775 | #ifdef  __cplusplus
776 | }
777 | #endif
778 | 


--------------------------------------------------------------------------------
/Sources/alpaca.cpp/chat.cpp:
--------------------------------------------------------------------------------
   1 | //
   2 | //  chat.cpp
   3 | //  alpaca.cpp
   4 | //
   5 | //  Created by Yoshimasa Niwa on 3/16/23.
   6 | //
   7 | 
   8 | #include "chat.h"
   9 | 
  10 | #include <cassert>
  11 | #include <cmath>
  12 | #include <cstdio>
  13 | #include <cstring>
  14 | #include <fstream>
  15 | 
  16 | #include <unistd.h>
  17 | #include <sys/mman.h>
  18 | #include <sys/stat.h>
  19 | #include <fcntl.h>
  20 | 
  21 | // determine number of model parts based on the dimension
  22 | static const std::map<int, int> LLAMA_N_PARTS = {
  23 |     { 4096, 1 },
  24 |     { 5120, 1 },
  25 |     { 6656, 4 },
  26 |     { 8192, 8 },
  27 | };
  28 | 
  29 | static int fin_init(mbuf_t& mbuf, const char* fname)
  30 | {
  31 |     int fd, nread;
  32 |     struct stat sb;
  33 |     if((fd = open(fname, O_RDONLY)) < 0){
  34 | #if DEBUG
  35 |         printf("mmap open %s failed\n", fname);
  36 | #endif // DEBUG
  37 |         return -1;
  38 |     }
  39 |     if((fstat(fd, &sb)) == -1 ){
  40 | #if DEBUG
  41 |         printf("fstat failed\n");
  42 | #endif // DEBUG
  43 |         return -1;
  44 |     }
  45 |     char* model_buf = (char*)mmap(\
  46 |         NULL, sb.st_size, PROT_READ, MAP_SHARED, fd, 0); //MAP_SHARED, MAP_PRIVATE
  47 |     if((void*)model_buf ==(void*) -1){
  48 | #if DEBUG
  49 |         printf("mmap failed\n");
  50 | #endif // DEBUG
  51 |         close(fd);
  52 |         return -1;
  53 |     }
  54 |     close(fd);
  55 |     mbuf.buf = model_buf;
  56 |     mbuf.size = (size_t)sb.st_size;
  57 |     mbuf.p =  mbuf.buf;
  58 |     mbuf.oft = 0;
  59 | #if DEBUG
  60 |     printf("mmap 0x%lx~0x%lx, size=0x%lx\r\n", (size_t)(model_buf), (size_t)(model_buf+mbuf.size), mbuf.size);
  61 | #endif // DEBUG
  62 |     return 0;
  63 | }
  64 | 
  65 | static void fin_read(mbuf_t& mbuf, char* dst, size_t len)
  66 | {
  67 |     if(mbuf.oft+len>mbuf.size){
  68 |         len = mbuf.size - mbuf.oft;
  69 |     }
  70 |     memcpy(dst, mbuf.p, len);
  71 |     mbuf.oft+=len;
  72 |     mbuf.p+=len;
  73 | }
  74 | 
  75 | static void fin_read_dummy(mbuf_t& mbuf, char** dst, size_t len)
  76 | {
  77 |     if(mbuf.oft+len>mbuf.size){
  78 |         len = mbuf.size - mbuf.oft;
  79 |     }
  80 |     //memcpy(dst, mbuf.p, len);
  81 |     *dst = mbuf.p;
  82 |     mbuf.oft+=len;
  83 |     mbuf.p+=len;
  84 | }
  85 | 
  86 | static size_t fin_tellg(mbuf_t& mbuf)
  87 | {
  88 |     return mbuf.oft;
  89 | }
  90 | 
  91 | static void fin_seekg(mbuf_t& mbuf, size_t oft)
  92 | {
  93 |     mbuf.oft = oft;
  94 |     mbuf.p = mbuf.buf + oft;
  95 |     return;
  96 | }
  97 | 
  98 | static bool fin_eof(mbuf_t& mbuf)
  99 | {
 100 |     return mbuf.oft>=mbuf.size;
 101 | }
 102 | 
 103 | static void fin_close(mbuf_t& mbuf)
 104 | {
 105 |     if (!mbuf.buf) {
 106 |         return;
 107 |     }
 108 |     munmap(mbuf.buf, mbuf.size);
 109 |     mbuf.buf = NULL;
 110 |     mbuf.size = 0;
 111 |     mbuf.p =  NULL;
 112 |     mbuf.oft = 0;
 113 | }
 114 | 
 115 | void llma_model_unload(llama_model &model)
 116 | {
 117 |     ggml_free(model.ctx);
 118 |     fin_close(model.mbuf);
 119 | }
 120 | 
 121 | // load the model's weights from a file, use mmap to save memory
 122 | bool llama_model_load_lowmem(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
 123 | #if DEBUG
 124 |     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 125 | 
 126 |     fprintf(stderr, "BLAS= %d\n", ggml_cpu_has_blas());
 127 |     fprintf(stderr, "NEON= %d\n", ggml_cpu_has_neon());
 128 |     fprintf(stderr, "ARM_FMA= %d\n", ggml_cpu_has_arm_fma());
 129 | #endif // DEBUG
 130 |     std::vector<char> f_buf(1024*1024);
 131 | 
 132 |     int res=fin_init(model.mbuf, fname.c_str());
 133 |     if(res) return false;
 134 | 
 135 |     // verify magic
 136 |     {
 137 |         uint32_t magic;
 138 |         fin_read(model.mbuf, (char *) &magic, sizeof(magic));
 139 |         if (magic != 0x67676d6c) {
 140 | #if DEBUG
 141 |             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
 142 | #endif // DEBUG
 143 |             return false;
 144 |         }
 145 |     }
 146 | 
 147 |     int n_ff = 0;
 148 |     int n_parts = 0;
 149 | 
 150 |     // load hparams
 151 |     {
 152 |         auto & hparams = model.hparams;
 153 | 
 154 |         fin_read(model.mbuf, (char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 155 |         //fin_read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
 156 |         fin_read(model.mbuf, (char *) &hparams.n_embd,  sizeof(hparams.n_embd));
 157 |         fin_read(model.mbuf, (char *) &hparams.n_mult,  sizeof(hparams.n_mult));
 158 |         fin_read(model.mbuf, (char *) &hparams.n_head,  sizeof(hparams.n_head));
 159 |         fin_read(model.mbuf, (char *) &hparams.n_layer, sizeof(hparams.n_layer));
 160 |         fin_read(model.mbuf, (char *) &hparams.n_rot,   sizeof(hparams.n_rot));
 161 |         fin_read(model.mbuf, (char *) &hparams.f16,     sizeof(hparams.f16));
 162 | 
 163 |         hparams.n_ctx = n_ctx;
 164 | 
 165 |         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 166 |         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 167 | 
 168 | #if DEBUG
 169 |         fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 170 |         fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
 171 |         fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
 172 |         fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
 173 |         fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
 174 |         fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
 175 |         fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
 176 |         fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
 177 |         fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
 178 |         fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
 179 | #endif // DEBUG
 180 |     }
 181 | 
 182 |     // load vocab
 183 |     {
 184 |         const int32_t n_vocab = model.hparams.n_vocab;
 185 | 
 186 |         if (n_vocab != model.hparams.n_vocab) {
 187 | #if DEBUG
 188 |             fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
 189 |                     __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
 190 | #endif // DEBUG
 191 |             return false;
 192 |         }
 193 | 
 194 |         std::string word;
 195 |         for (int i = 0; i < n_vocab; i++) {
 196 |             uint32_t len;
 197 |             fin_read(model.mbuf, (char *) &len, sizeof(len));
 198 | 
 199 |             word.resize(len);
 200 |             fin_read(model.mbuf, (char *) word.data(), len);
 201 | 
 202 |             vocab.token_to_id[word] = i;
 203 |             vocab.id_to_token[i] = word;
 204 | 
 205 |             //if (i < 30000) {
 206 |             //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
 207 |             //}
 208 |         }
 209 |     }
 210 | 
 211 |     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 212 |     // in order to save memory and also to speed up the computation
 213 |     ggml_type wtype = GGML_TYPE_COUNT;
 214 |     switch (model.hparams.f16) {
 215 |         case 0: wtype = GGML_TYPE_F32;  break;
 216 |         case 1: wtype = GGML_TYPE_F16;  break;
 217 |         case 2: wtype = GGML_TYPE_Q4_0; break;
 218 |         case 3: wtype = GGML_TYPE_Q4_1; break;
 219 |         default:
 220 |         {
 221 | #if DEBUG
 222 |             fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
 223 |                     __func__, fname.c_str(), model.hparams.f16);
 224 | #endif // DEBUG
 225 |             return false;
 226 |         }
 227 |     }
 228 | 
 229 |     const ggml_type wtype2 = GGML_TYPE_F32;
 230 | 
 231 |     auto & ctx = model.ctx;
 232 | 
 233 |     size_t ctx_size = 0;
 234 | 
 235 |     {
 236 |         const auto & hparams = model.hparams;
 237 | 
 238 |         const int n_embd  = hparams.n_embd;
 239 |         const int n_layer = hparams.n_layer;
 240 |         const int n_ctx   = hparams.n_ctx;
 241 |         const int n_vocab = hparams.n_vocab;
 242 | 
 243 |         ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings
 244 | 
 245 |         ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 246 | 
 247 |         ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output
 248 | 
 249 | /*
 250 |         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 251 | 
 252 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
 253 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
 254 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
 255 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 256 | 
 257 |         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 258 | 
 259 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
 260 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
 261 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
 262 | */
 263 | 
 264 |         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
 265 |         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
 266 | 
 267 |         ctx_size += (5 + 10*n_layer)*256; // object overhead
 268 | 
 269 | #if DEBUG
 270 |         fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 271 | #endif // DEBUG
 272 |     }
 273 | 
 274 |     // create the ggml context
 275 |     {
 276 |         struct ggml_init_params params = {
 277 |             /*.mem_size   =*/ ctx_size,
 278 |             /*.mem_buffer =*/ NULL,
 279 |         };
 280 | 
 281 |         model.ctx = ggml_init(params);
 282 |         if (!model.ctx) {
 283 | #if DEBUG
 284 |             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
 285 | #endif // DEBUG
 286 |             return false;
 287 |         }
 288 |     }
 289 | 
 290 |     // prepare memory for the weights
 291 |     {
 292 |         const auto & hparams = model.hparams;
 293 | 
 294 |         const int n_embd  = hparams.n_embd;
 295 |         const int n_layer = hparams.n_layer;
 296 |         const int n_ctx   = hparams.n_ctx;
 297 |         const int n_vocab = hparams.n_vocab;
 298 | 
 299 |         model.layers.resize(n_layer);
 300 | 
 301 |         model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
 302 | 
 303 |         model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 304 |         model.output = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
 305 | 
 306 |         // map by name
 307 |         model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
 308 | 
 309 |         model.tensors["norm.weight"]   = model.norm;
 310 |         model.tensors["output.weight"] = model.output;
 311 | 
 312 |         for (int i = 0; i < n_layer; ++i) {
 313 |             auto & layer = model.layers[i];
 314 | 
 315 |             layer.attention_norm = ggml_new_tensor_1d_dummy(ctx, GGML_TYPE_F32, n_embd);
 316 | 
 317 |             layer.wq = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd);
 318 |             layer.wk = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd);
 319 |             layer.wv = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd);
 320 |             layer.wo = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd, n_embd);
 321 | 
 322 |             layer.ffn_norm = ggml_new_tensor_1d_dummy(ctx, GGML_TYPE_F32, n_embd);
 323 | 
 324 |             layer.w1 = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd,   n_ff);
 325 |             layer.w2 = ggml_new_tensor_2d_dummy(ctx, wtype,   n_ff, n_embd);
 326 |             layer.w3 = ggml_new_tensor_2d_dummy(ctx, wtype, n_embd,   n_ff);
 327 | 
 328 |             // map by name
 329 |             model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
 330 | 
 331 |             model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
 332 |             model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
 333 |             model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
 334 |             model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
 335 | 
 336 |             model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
 337 | 
 338 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
 339 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
 340 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
 341 |         }
 342 |     }
 343 | 
 344 |     // key + value memory
 345 |     {
 346 |         const auto & hparams = model.hparams;
 347 | 
 348 |         const int n_embd  = hparams.n_embd;
 349 |         const int n_layer = hparams.n_layer;
 350 |         const int n_ctx   = hparams.n_ctx;
 351 | 
 352 |         const int n_mem      = n_layer*n_ctx;
 353 |         const int n_elements = n_embd*n_mem;
 354 | 
 355 |         model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
 356 |         model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, n_elements);
 357 | 
 358 |         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 359 | #if DEBUG
 360 |         fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
 361 | #endif // DEBUG
 362 |     }
 363 | 
 364 |     const size_t file_offset = fin_tellg(model.mbuf);
 365 | 
 366 |     //fin_close();
 367 | 
 368 |     std::vector<uint8_t> tmp;
 369 | 
 370 |     for (int i = 0; i < n_parts; ++i) {
 371 |         const int part_id = i;
 372 |         //const int part_id = n_parts - i - 1;
 373 | 
 374 |         std::string fname_part = fname;
 375 |         if (i > 0) {
 376 |             fname_part += "." + std::to_string(i);
 377 |         }
 378 | 
 379 |         //fin_init(fname_part.c_str());
 380 |         fin_seekg(model.mbuf, file_offset);
 381 | 
 382 |         // load weights
 383 |         {
 384 |             int n_tensors = 0;
 385 |             size_t total_size = 0;
 386 | 
 387 | #if DEBUG
 388 |             fprintf(stderr, "%s: ", __func__);
 389 | #endif // DEBUG
 390 | 
 391 |             while (true) {
 392 |                 int32_t n_dims;
 393 |                 int32_t length;
 394 |                 int32_t ftype;
 395 | 
 396 |                 fin_read(model.mbuf, reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
 397 |                 fin_read(model.mbuf, reinterpret_cast<char *>(&length), sizeof(length));
 398 |                 fin_read(model.mbuf, reinterpret_cast<char *>(&ftype),  sizeof(ftype));
 399 | 
 400 |                 if (fin_eof(model.mbuf)) {
 401 |                     break;
 402 |                 }
 403 | 
 404 |                 int32_t nelements = 1;
 405 |                 int32_t ne[2] = { 1, 1 };
 406 |                 for (int i = 0; i < n_dims; ++i) {
 407 |                     fin_read(model.mbuf, reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
 408 |                     nelements *= ne[i];
 409 |                 }
 410 | 
 411 |                 std::string name(length, 0);
 412 |                 fin_read(model.mbuf, &name[0], length);
 413 | 
 414 |                 if (model.tensors.find(name.data()) == model.tensors.end()) {
 415 | #if DEBUG
 416 |                     fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
 417 | #endif // DEBUG
 418 |                     return false;
 419 |                 }
 420 | 
 421 |                 // split_type = 0: split by columns
 422 |                 // split_type = 1: split by rows
 423 |                 int split_type = 0;
 424 | 
 425 |                 // split_type = 0:
 426 |                 // regex:
 427 |                 //   - tok_embeddings.*
 428 |                 //   - layers.*.attention.wo.weight
 429 |                 //   - layers.*.feed_forward.w2.weight
 430 | 
 431 |                 // split_type = 1:
 432 |                 // regex:
 433 |                 //   - output.*
 434 |                 //   - layers.*.attention.wq.weight
 435 |                 //   - layers.*.attention.wk.weight
 436 |                 //   - layers.*.attention.wv.weight
 437 |                 //   - layers.*.feed_forward.w1.weight
 438 |                 //   - layers.*.feed_forward.w3.weight
 439 |                 if (name.find("tok_embeddings") != std::string::npos) {
 440 |                     split_type = 0;
 441 |                 } else if (name.find("layers") != std::string::npos) {
 442 |                     if (name.find("attention.wo.weight") != std::string::npos) {
 443 |                         split_type = 0;
 444 |                     } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
 445 |                         split_type = 0;
 446 |                     } else {
 447 |                         split_type = 1;
 448 |                     }
 449 |                 } else if (name.find("output") != std::string::npos) {
 450 |                     split_type = 1;
 451 |                 }
 452 | 
 453 |                 auto tensor = model.tensors[name.data()];
 454 | 
 455 |                 if (n_dims == 1) {
 456 |                     if (ggml_nelements(tensor) != nelements) {
 457 | #if DEBUG
 458 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
 459 | #endif // DEBUG
 460 |                         return false;
 461 |                     }
 462 |                 } else {
 463 |                     if (ggml_nelements(tensor)/n_parts != nelements) {
 464 | #if DEBUG
 465 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
 466 | #endif // DEBUG
 467 |                         return false;
 468 |                     }
 469 |                 }
 470 | 
 471 |                 if (n_dims == 1) {
 472 |                     if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
 473 | #if DEBUG
 474 |                         fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 475 |                                 __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
 476 | #endif // DEBUG
 477 |                         return false;
 478 |                     }
 479 |                 } else {
 480 |                     if (split_type == 0) {
 481 |                         if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
 482 | #if DEBUG
 483 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 484 |                                     __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
 485 | #endif // DEBUG
 486 |                             return false;
 487 |                         }
 488 |                     } else {
 489 |                         if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
 490 | #if DEBUG
 491 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 492 |                                     __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
 493 | #endif // DEBUG
 494 |                             return false;
 495 |                         }
 496 |                     }
 497 |                 }
 498 | 
 499 | #if DEBUG
 500 |                 if (0) {
 501 |                     static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
 502 |                     fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
 503 |                 }
 504 | #endif // DEBUG
 505 | 
 506 |                 size_t bpe = 0;
 507 | 
 508 |                 switch (ftype) {
 509 |                     case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
 510 |                     case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
 511 |                     case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
 512 |                     case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
 513 |                     default:
 514 |                     {
 515 | #if DEBUG
 516 |                         fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
 517 | #endif // DEBUG
 518 |                         return false;
 519 |                     }
 520 |                 };
 521 | 
 522 |                 if (n_dims == 1 || n_parts == 1) {
 523 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
 524 | #if DEBUG
 525 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
 526 |                                 __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
 527 | #endif // DEBUG
 528 |                         return false;
 529 |                     }
 530 | 
 531 |                     if (part_id == 0) {
 532 |                         //change here to enable mmap load
 533 |                         //fin_read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
 534 |                         //fin_read_dummy((char**)&tensor->data, ggml_nbytes(tensor));
 535 |                         size_t len = ggml_nbytes(tensor);
 536 |                         fin_read_dummy(model.mbuf, (char**)&tensor->data, len);
 537 | 
 538 |                     } else {
 539 |                         fin_seekg(model.mbuf, ggml_nbytes(tensor));
 540 |                     }
 541 | 
 542 |                     total_size += ggml_nbytes(tensor);
 543 |                 } else {
 544 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
 545 | #if DEBUG
 546 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
 547 |                                 __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
 548 | #endif // DEBUG
 549 |                         return false;
 550 |                     }
 551 | 
 552 |                     if (split_type == 0) {
 553 |                         const int np0 = ne[0];
 554 | 
 555 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
 556 |                         assert(row_size == tensor->nb[1]);
 557 | 
 558 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
 559 |                             const size_t offset_row = i1*row_size;
 560 |                             const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
 561 |                             fin_read(model.mbuf, reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
 562 |                         }
 563 |                     } else {
 564 |                         const int np1 = ne[1];
 565 | 
 566 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
 567 | 
 568 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
 569 |                             const size_t offset_row = (i1 + part_id*np1)*row_size;
 570 |                             fin_read(model.mbuf, reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
 571 |                         }
 572 |                     }
 573 | 
 574 |                     total_size += ggml_nbytes(tensor)/n_parts;
 575 |                 }
 576 | 
 577 | #if DEBUG
 578 |                 //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
 579 |                 if (++n_tensors % 8 == 0) {
 580 |                     fprintf(stderr, ".");
 581 |                     fflush(stderr);
 582 |                 }
 583 | #endif // DEBUG
 584 |             }
 585 | 
 586 | #if DEBUG
 587 |             fprintf(stderr, " done\n");
 588 | 
 589 |             fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
 590 | #endif // DEBUG
 591 |         }
 592 | 
 593 |         //fin_close(); //can't unmap here
 594 |     }
 595 | 
 596 |     return true;
 597 | }
 598 | 
 599 | // load the model's weights from a file
 600 | bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
 601 | #if DEBUG
 602 |     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 603 | #endif // DEBUG
 604 | 
 605 |     std::vector<char> f_buf(1024*1024);
 606 | 
 607 |     auto fin = std::ifstream(fname, std::ios::binary);
 608 |     fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
 609 |     if (!fin) {
 610 | #if DEBUG
 611 |         fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
 612 | #endif // DEBUG
 613 |         return false;
 614 |     }
 615 | 
 616 |     // verify magic
 617 |     {
 618 |         uint32_t magic;
 619 |         fin.read((char *) &magic, sizeof(magic));
 620 |         if (magic != 0x67676d6c) {
 621 | #if DEBUG
 622 |             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
 623 | #endif // DEBUG
 624 |             return false;
 625 |         }
 626 |     }
 627 | 
 628 |     int n_ff = 0;
 629 |     int n_parts = 0;
 630 | 
 631 |     // load hparams
 632 |     {
 633 |         auto & hparams = model.hparams;
 634 | 
 635 |         fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
 636 |         //fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
 637 |         fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
 638 |         fin.read((char *) &hparams.n_mult,  sizeof(hparams.n_mult));
 639 |         fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
 640 |         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
 641 |         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
 642 |         fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
 643 | 
 644 |         hparams.n_ctx = n_ctx;
 645 | 
 646 |         n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
 647 |         n_parts = LLAMA_N_PARTS.at(hparams.n_embd);
 648 | 
 649 |         // fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
 650 |         // fprintf(stderr, "%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
 651 |         // fprintf(stderr, "%s: n_embd  = %d\n", __func__, hparams.n_embd);
 652 |         // fprintf(stderr, "%s: n_mult  = %d\n", __func__, hparams.n_mult);
 653 |         // fprintf(stderr, "%s: n_head  = %d\n", __func__, hparams.n_head);
 654 |         // fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
 655 |         // fprintf(stderr, "%s: n_rot   = %d\n", __func__, hparams.n_rot);
 656 |         // fprintf(stderr, "%s: f16     = %d\n", __func__, hparams.f16);
 657 |         // fprintf(stderr, "%s: n_ff    = %d\n", __func__, n_ff);
 658 |         // fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
 659 |     }
 660 | 
 661 |     // load vocab
 662 |     {
 663 |         const int32_t n_vocab = model.hparams.n_vocab;
 664 | 
 665 |         if (n_vocab != model.hparams.n_vocab) {
 666 | #if DEBUG
 667 |             fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
 668 |                     __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
 669 | #endif // DEBUG
 670 |             return false;
 671 |         }
 672 | 
 673 |         std::string word;
 674 |         for (int i = 0; i < n_vocab; i++) {
 675 |             uint32_t len;
 676 |             fin.read((char *) &len, sizeof(len));
 677 | 
 678 |             word.resize(len);
 679 |             fin.read((char *) word.data(), len);
 680 | 
 681 |             vocab.token_to_id[word] = i;
 682 |             vocab.id_to_token[i] = word;
 683 | 
 684 |             //if (i < 30000) {
 685 |             //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
 686 |             //}
 687 |         }
 688 |     }
 689 | 
 690 |     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
 691 |     // in order to save memory and also to speed up the computation
 692 |     ggml_type wtype = GGML_TYPE_COUNT;
 693 |     switch (model.hparams.f16) {
 694 |         case 0: wtype = GGML_TYPE_F32;  break;
 695 |         case 1: wtype = GGML_TYPE_F16;  break;
 696 |         case 2: wtype = GGML_TYPE_Q4_0; break;
 697 |         case 3: wtype = GGML_TYPE_Q4_1; break;
 698 |         default:
 699 |         {
 700 | #if DEBUG
 701 |             fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
 702 |                     __func__, fname.c_str(), model.hparams.f16);
 703 | #endif // DEBUG
 704 |             return false;
 705 |         }
 706 |     }
 707 | 
 708 |     const ggml_type wtype2 = GGML_TYPE_F32;
 709 | 
 710 |     auto & ctx = model.ctx;
 711 | 
 712 |     size_t ctx_size = 0;
 713 | 
 714 |     {
 715 |         const auto & hparams = model.hparams;
 716 | 
 717 |         const int n_embd  = hparams.n_embd;
 718 |         const int n_layer = hparams.n_layer;
 719 |         const int n_ctx   = hparams.n_ctx;
 720 |         const int n_vocab = hparams.n_vocab;
 721 | 
 722 |         ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings
 723 | 
 724 |         ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm
 725 | 
 726 |         ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output
 727 | 
 728 |         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm
 729 | 
 730 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq
 731 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk
 732 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv
 733 |         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo
 734 | 
 735 |         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm
 736 | 
 737 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1
 738 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2
 739 |         ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3
 740 | 
 741 |         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
 742 |         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
 743 | 
 744 |         ctx_size += (5 + 10*n_layer)*256; // object overhead
 745 | 
 746 | #if DEBUG
 747 |         fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
 748 | #endif // DEBUG
 749 |     }
 750 | 
 751 |     // create the ggml context
 752 |     {
 753 |         struct ggml_init_params params = {
 754 |             /*.mem_size   =*/ ctx_size,
 755 |             /*.mem_buffer =*/ NULL,
 756 |         };
 757 | 
 758 |         model.ctx = ggml_init(params);
 759 |         if (!model.ctx) {
 760 | #if DEBUG
 761 |             fprintf(stderr, "%s: ggml_init() failed\n", __func__);
 762 | #endif // DEBUG
 763 |             return false;
 764 |         }
 765 |     }
 766 | 
 767 |     // prepare memory for the weights
 768 |     {
 769 |         const auto & hparams = model.hparams;
 770 | 
 771 |         const int n_embd  = hparams.n_embd;
 772 |         const int n_layer = hparams.n_layer;
 773 |         const int n_ctx   = hparams.n_ctx;
 774 |         const int n_vocab = hparams.n_vocab;
 775 | 
 776 |         model.layers.resize(n_layer);
 777 | 
 778 |         model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
 779 | 
 780 |         model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 781 |         model.output = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
 782 | 
 783 |         // map by name
 784 |         model.tensors["tok_embeddings.weight"] = model.tok_embeddings;
 785 | 
 786 |         model.tensors["norm.weight"]   = model.norm;
 787 |         model.tensors["output.weight"] = model.output;
 788 | 
 789 |         for (int i = 0; i < n_layer; ++i) {
 790 |             auto & layer = model.layers[i];
 791 | 
 792 |             layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 793 | 
 794 |             layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 795 |             layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 796 |             layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 797 |             layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
 798 | 
 799 |             layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
 800 | 
 801 |             layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);
 802 |             layer.w2 = ggml_new_tensor_2d(ctx, wtype,   n_ff, n_embd);
 803 |             layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);
 804 | 
 805 |             // map by name
 806 |             model.tensors["layers." + std::to_string(i) + ".attention_norm.weight"] = layer.attention_norm;
 807 | 
 808 |             model.tensors["layers." + std::to_string(i) + ".attention.wq.weight"] = layer.wq;
 809 |             model.tensors["layers." + std::to_string(i) + ".attention.wk.weight"] = layer.wk;
 810 |             model.tensors["layers." + std::to_string(i) + ".attention.wv.weight"] = layer.wv;
 811 |             model.tensors["layers." + std::to_string(i) + ".attention.wo.weight"] = layer.wo;
 812 | 
 813 |             model.tensors["layers." + std::to_string(i) + ".ffn_norm.weight"] = layer.ffn_norm;
 814 | 
 815 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w1.weight"] = layer.w1;
 816 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w2.weight"] = layer.w2;
 817 |             model.tensors["layers." + std::to_string(i) + ".feed_forward.w3.weight"] = layer.w3;
 818 |         }
 819 |     }
 820 | 
 821 |     // key + value memory
 822 |     {
 823 |         const auto & hparams = model.hparams;
 824 | 
 825 |         const int n_embd  = hparams.n_embd;
 826 |         const int n_layer = hparams.n_layer;
 827 |         const int n_ctx   = hparams.n_ctx;
 828 | 
 829 |         const int n_mem      = n_layer*n_ctx;
 830 |         const int n_elements = n_embd*n_mem;
 831 | 
 832 |         model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
 833 |         model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
 834 | 
 835 |         const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
 836 | 
 837 | #if DEBUG
 838 |         fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
 839 | #endif // DEBUG
 840 |     }
 841 | 
 842 |     const size_t file_offset = fin.tellg();
 843 | 
 844 |     fin.close();
 845 | 
 846 |     std::vector<uint8_t> tmp;
 847 | 
 848 |     for (int i = 0; i < n_parts; ++i) {
 849 |         const int part_id = i;
 850 |         //const int part_id = n_parts - i - 1;
 851 | 
 852 |         std::string fname_part = fname;
 853 |         if (i > 0) {
 854 |             fname_part += "." + std::to_string(i);
 855 |         }
 856 | 
 857 | #if DEBUG
 858 |         fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
 859 | #endif // DEBUG
 860 | 
 861 |         fin = std::ifstream(fname_part, std::ios::binary);
 862 |         fin.rdbuf()->pubsetbuf(f_buf.data(), f_buf.size());
 863 |         fin.seekg(file_offset);
 864 | 
 865 |         // load weights
 866 |         {
 867 |             int n_tensors = 0;
 868 |             size_t total_size = 0;
 869 | 
 870 | #if DEBUG
 871 |             fprintf(stderr, "%s: ", __func__);
 872 | #endif // DEBUG
 873 | 
 874 |             while (true) {
 875 |                 int32_t n_dims;
 876 |                 int32_t length;
 877 |                 int32_t ftype;
 878 | 
 879 |                 fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
 880 |                 fin.read(reinterpret_cast<char *>(&length), sizeof(length));
 881 |                 fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
 882 | 
 883 |                 if (fin.eof()) {
 884 |                     break;
 885 |                 }
 886 | 
 887 |                 int32_t nelements = 1;
 888 |                 int32_t ne[2] = { 1, 1 };
 889 |                 for (int i = 0; i < n_dims; ++i) {
 890 |                     fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
 891 |                     nelements *= ne[i];
 892 |                 }
 893 | 
 894 |                 std::string name(length, 0);
 895 |                 fin.read(&name[0], length);
 896 | 
 897 |                 if (model.tensors.find(name.data()) == model.tensors.end()) {
 898 | #if DEBUG
 899 |                     fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
 900 | #endif // DEBUG
 901 |                     return false;
 902 |                 }
 903 | 
 904 |                 // split_type = 0: split by columns
 905 |                 // split_type = 1: split by rows
 906 |                 int split_type = 0;
 907 | 
 908 |                 // split_type = 0:
 909 |                 // regex:
 910 |                 //   - tok_embeddings.*
 911 |                 //   - layers.*.attention.wo.weight
 912 |                 //   - layers.*.feed_forward.w2.weight
 913 | 
 914 |                 // split_type = 1:
 915 |                 // regex:
 916 |                 //   - output.*
 917 |                 //   - layers.*.attention.wq.weight
 918 |                 //   - layers.*.attention.wk.weight
 919 |                 //   - layers.*.attention.wv.weight
 920 |                 //   - layers.*.feed_forward.w1.weight
 921 |                 //   - layers.*.feed_forward.w3.weight
 922 |                 if (name.find("tok_embeddings") != std::string::npos) {
 923 |                     split_type = 0;
 924 |                 } else if (name.find("layers") != std::string::npos) {
 925 |                     if (name.find("attention.wo.weight") != std::string::npos) {
 926 |                         split_type = 0;
 927 |                     } else if (name.find("feed_forward.w2.weight") != std::string::npos) {
 928 |                         split_type = 0;
 929 |                     } else {
 930 |                         split_type = 1;
 931 |                     }
 932 |                 } else if (name.find("output") != std::string::npos) {
 933 |                     split_type = 1;
 934 |                 }
 935 | 
 936 |                 auto tensor = model.tensors[name.data()];
 937 | 
 938 |                 if (n_dims == 1) {
 939 |                     if (ggml_nelements(tensor) != nelements) {
 940 | #if DEBUG
 941 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
 942 | #endif // DEBUG
 943 |                         return false;
 944 |                     }
 945 |                 } else {
 946 |                     if (ggml_nelements(tensor)/n_parts != nelements) {
 947 | #if DEBUG
 948 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
 949 | #endif // DEBUG
 950 |                         return false;
 951 |                     }
 952 |                 }
 953 | 
 954 |                 if (n_dims == 1) {
 955 |                     if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
 956 | #if DEBUG
 957 |                         fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 958 |                                 __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
 959 | #endif // DEBUG
 960 |                         return false;
 961 |                     }
 962 |                 } else {
 963 |                     if (split_type == 0) {
 964 |                         if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {
 965 | #if DEBUG
 966 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 967 |                                     __func__, name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]);
 968 | #endif // DEBUG
 969 |                             return false;
 970 |                         }
 971 |                     } else {
 972 |                         if (tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {
 973 | #if DEBUG
 974 |                             fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
 975 |                                     __func__, name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]);
 976 | #endif // DEBUG
 977 |                             return false;
 978 |                         }
 979 |                     }
 980 |                 }
 981 | 
 982 | #if DEBUG
 983 |                 if (0) {
 984 |                     static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
 985 |                     fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
 986 |                 }
 987 | #endif // DEBUG
 988 | 
 989 |                 size_t bpe = 0;
 990 | 
 991 |                 switch (ftype) {
 992 |                     case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;
 993 |                     case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
 994 |                     case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
 995 |                     case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
 996 |                     default:
 997 |                     {
 998 | #if DEBUG
 999 |                         fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
1000 | #endif // DEBUG
1001 |                         return false;
1002 |                     }
1003 |                 };
1004 | 
1005 |                 if (n_dims == 1 || n_parts == 1) {
1006 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
1007 | #if DEBUG
1008 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1009 |                                 __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
1010 | #endif // DEBUG
1011 |                         return false;
1012 |                     }
1013 | 
1014 |                     if (part_id == 0) {
1015 |                         fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
1016 |                     } else {
1017 |                         fin.seekg(ggml_nbytes(tensor), std::ios::cur);
1018 |                     }
1019 | 
1020 |                     total_size += ggml_nbytes(tensor);
1021 |                 } else {
1022 |                     if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {
1023 | #if DEBUG
1024 |                         fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
1025 |                                 __func__, name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe);
1026 | #endif // DEBUG
1027 |                         return false;
1028 |                     }
1029 | 
1030 |                     if (split_type == 0) {
1031 |                         const int np0 = ne[0];
1032 | 
1033 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
1034 |                         assert(row_size == tensor->nb[1]);
1035 | 
1036 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
1037 |                             const size_t offset_row = i1*row_size;
1038 |                             const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
1039 |                             fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);
1040 |                         }
1041 |                     } else {
1042 |                         const int np1 = ne[1];
1043 | 
1044 |                         const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);
1045 | 
1046 |                         for (int i1 = 0; i1 < ne[1]; ++i1) {
1047 |                             const size_t offset_row = (i1 + part_id*np1)*row_size;
1048 |                             fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);
1049 |                         }
1050 |                     }
1051 | 
1052 |                     total_size += ggml_nbytes(tensor)/n_parts;
1053 |                 }
1054 | 
1055 | #if DEBUG
1056 |                 //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
1057 |                 if (++n_tensors % 8 == 0) {
1058 |                     fprintf(stderr, ".");
1059 |                     fflush(stderr);
1060 |                 }
1061 | #endif // DEBUG
1062 |             }
1063 | 
1064 | #if DEBUG
1065 |             fprintf(stderr, " done\n");
1066 | 
1067 |             fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
1068 | #endif // DEBUG
1069 |         }
1070 | 
1071 |         fin.close();
1072 |     }
1073 | 
1074 |     return true;
1075 | }
1076 | 
1077 | // evaluate the transformer
1078 | //
1079 | //   - model:     the model
1080 | //   - n_threads: number of threads to use
1081 | //   - n_past:    the context size so far
1082 | //   - embd_inp:  the embeddings of the tokens in the context
1083 | //   - embd_w:    the predicted logits for the next token
1084 | //
1085 | // The GPT-J model requires about 16MB of memory per input token.
1086 | //
1087 | bool llama_eval(
1088 |         const llama_model & model,
1089 |         const int n_threads,
1090 |         const int n_past,
1091 |         const std::vector<gpt_vocab::id> & embd_inp,
1092 |               std::vector<float>         & embd_w,
1093 |               size_t                     & mem_per_token) {
1094 |     const int N = (int)embd_inp.size();
1095 | 
1096 |     const auto & hparams = model.hparams;
1097 | 
1098 |     const int n_embd  = hparams.n_embd;
1099 |     const int n_layer = hparams.n_layer;
1100 |     const int n_ctx   = hparams.n_ctx;
1101 |     const int n_head  = hparams.n_head;
1102 |     const int n_vocab = hparams.n_vocab;
1103 |     const int n_rot   = hparams.n_embd/hparams.n_head;
1104 | 
1105 |     const int d_key = n_embd/n_head;
1106 | 
1107 |     // TODO: check if this size scales with n_ctx linearly and remove constant. somehow I feel it wasn't the case
1108 |     // static size_t buf_size = hparams.n_ctx*1024*1024;
1109 |     static size_t buf_size = 512u*1024*1024;
1110 |     static void * buf = malloc(buf_size);
1111 | 
1112 |     if (mem_per_token > 0 && mem_per_token*N > buf_size) {
1113 |         const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
1114 |         //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
1115 | 
1116 |         // reallocate
1117 |         buf_size = buf_size_new;
1118 |         buf = realloc(buf, buf_size);
1119 |         if (buf == nullptr) {
1120 | #if DEBUG
1121 |             fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
1122 | #endif // DEBUG
1123 |             return false;
1124 |         }
1125 |     }
1126 | 
1127 |     struct ggml_init_params params = {
1128 |         /*.mem_size   =*/ buf_size,
1129 |         /*.mem_buffer =*/ buf,
1130 |     };
1131 | 
1132 |     struct ggml_context * ctx0 = ggml_init(params);
1133 |     ggml_cgraph gf = {};
1134 |     gf.n_threads = n_threads;
1135 | 
1136 |     struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
1137 |     memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
1138 | 
1139 |     struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
1140 | 
1141 |     for (int il = 0; il < n_layer; ++il) {
1142 |         struct ggml_tensor * inpSA = inpL;
1143 | 
1144 |         struct ggml_tensor * cur;
1145 | 
1146 |         // norm
1147 |         {
1148 |             cur = ggml_rms_norm(ctx0, inpL);
1149 | 
1150 |             // cur = attention_norm*cur
1151 |             cur = ggml_mul(ctx0,
1152 |                            ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
1153 |                            cur);
1154 |         }
1155 | 
1156 |         // self-attention
1157 |         {
1158 |             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
1159 |             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
1160 |             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
1161 | 
1162 |             // store key and value to memory
1163 |             if (N >= 1) {
1164 |                 struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
1165 |                 struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
1166 | 
1167 |                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
1168 |                 ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
1169 |             }
1170 | 
1171 |             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
1172 |             struct ggml_tensor * Q =
1173 |             ggml_permute(ctx0,
1174 |                          ggml_rope(ctx0,
1175 |                                    ggml_cpy(ctx0,
1176 |                                             Qcur,
1177 |                                             ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
1178 |                                    n_past, n_rot, 0),
1179 |                          0, 2, 1, 3);
1180 | 
1181 |             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
1182 |             struct ggml_tensor * K =
1183 |             ggml_permute(ctx0,
1184 |                          ggml_rope(ctx0,
1185 |                                    ggml_reshape_3d(ctx0,
1186 |                                                    ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
1187 |                                                    n_embd/n_head, n_head, n_past + N),
1188 |                                    n_past, n_rot, 1),
1189 |                          0, 2, 1, 3);
1190 | 
1191 |             // K * Q
1192 |             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
1193 | 
1194 |             // KQ_scaled = KQ / sqrt(n_embd/n_head)
1195 |             struct ggml_tensor * KQ_scaled =
1196 |             ggml_scale(ctx0,
1197 |                        KQ,
1198 |                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
1199 |                        );
1200 | 
1201 |             // KQ_masked = mask_past(KQ_scaled)
1202 |             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
1203 | 
1204 |             // KQ = soft_max(KQ_masked)
1205 |             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
1206 | 
1207 |             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
1208 |             struct ggml_tensor * V_trans =
1209 |             ggml_permute(ctx0,
1210 |                          ggml_reshape_3d(ctx0,
1211 |                                          ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
1212 |                                          n_embd/n_head, n_head, n_past + N),
1213 |                          1, 2, 0, 3);
1214 | 
1215 |             // KQV = transpose(V) * KQ_soft_max
1216 |             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
1217 | 
1218 |             // KQV_merged = KQV.permute(0, 2, 1, 3)
1219 |             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
1220 | 
1221 |             // cur = KQV_merged.contiguous().view(n_embd, N)
1222 |             cur = ggml_cpy(ctx0,
1223 |                            KQV_merged,
1224 |                            ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
1225 | 
1226 |             // projection (no bias)
1227 |             cur = ggml_mul_mat(ctx0,
1228 |                                model.layers[il].wo,
1229 |                                cur);
1230 |         }
1231 | 
1232 |         struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
1233 | 
1234 |         // feed-forward network
1235 |         {
1236 |             // norm
1237 |             {
1238 |                 cur = ggml_rms_norm(ctx0, inpFF);
1239 | 
1240 |                 // cur = ffn_norm*cur
1241 |                 cur = ggml_mul(ctx0,
1242 |                                ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
1243 |                                cur);
1244 |             }
1245 | 
1246 |             struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
1247 |                                                     model.layers[il].w3,
1248 |                                                     cur);
1249 | 
1250 | 
1251 |             cur = ggml_mul_mat(ctx0,
1252 |                                model.layers[il].w1,
1253 |                                cur);
1254 | 
1255 |             // SILU activation
1256 |             cur = ggml_silu(ctx0, cur);
1257 | 
1258 |             cur = ggml_mul(ctx0, cur, tmp);
1259 | 
1260 |             cur = ggml_mul_mat(ctx0,
1261 |                                model.layers[il].w2,
1262 |                                cur);
1263 |         }
1264 | 
1265 |         cur  = ggml_add(ctx0, cur, inpFF);
1266 | 
1267 |         // input for next layer
1268 |         inpL = cur;
1269 |     }
1270 | 
1271 |     // norm
1272 |     {
1273 |         inpL = ggml_rms_norm(ctx0, inpL);
1274 | 
1275 |         // inpL = norm*inpL
1276 |         inpL = ggml_mul(ctx0,
1277 |                         ggml_repeat(ctx0, model.norm, inpL),
1278 |                         inpL);
1279 |     }
1280 | 
1281 |     // lm_head
1282 |     {
1283 |         inpL = ggml_mul_mat(ctx0, model.output, inpL);
1284 |     }
1285 | 
1286 |     // logits -> probs
1287 |     //inpL = ggml_soft_max(ctx0, inpL);
1288 | 
1289 |     // run the computation
1290 |     ggml_build_forward_expand(&gf, inpL);
1291 |     ggml_graph_compute       (ctx0, &gf);
1292 | 
1293 |     //if (n_past%100 == 0) {
1294 |     //    ggml_graph_print   (&gf);
1295 |     //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
1296 |     //}
1297 | 
1298 |     //embd_w.resize(n_vocab*N);
1299 |     //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
1300 | 
1301 |     // return result for just the last token
1302 |     embd_w.resize(n_vocab);
1303 |     memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
1304 | 
1305 |     if (mem_per_token == 0) {
1306 |         mem_per_token = ggml_used_mem(ctx0)/N;
1307 |     }
1308 |     //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));
1309 | 
1310 |     ggml_free(ctx0);
1311 | 
1312 |     return true;
1313 | }
1314 | 


--------------------------------------------------------------------------------