├── .clang-format
├── .gitattributes
├── .gitignore
├── Benchmark1.png
├── Benchmarks
│   ├── Benchmarks.vcxproj
│   ├── Benchmarks.vcxproj.filters
│   ├── EigenBenchmark.cpp
│   ├── IntrinASMDotBenchmark.cpp
│   ├── IntrinsicSumBenchmarks.cpp
│   └── NumpyBenchmark.py
├── MatrixGenerator
│   ├── MatrixGenerator.cpp
│   ├── MatrixGenerator.vcxproj
│   └── MatrixGenerator.vcxproj.filters
├── MatrixMult.sln
├── MatrixMult
│   ├── CPUUtil.cpp
│   ├── CPUUtil.h
│   ├── MatrixMul.cpp
│   ├── MatrixMult.vcxproj
│   ├── MatrixMult.vcxproj.filters
│   └── ThreadPool.h
├── README.md
├── benchmark.xlsx
└── run.bat
/.clang-format:
--------------------------------------------------------------------------------
1 | # SPDX-License-Identifier: GPL-2.0
2 | #
3 | # clang-format configuration file. Intended for clang-format >= 4.
4 | #
5 | # For more information, see:
6 | #
7 | # Documentation/process/clang-format.rst
8 | # https://clang.llvm.org/docs/ClangFormat.html
9 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html
10 | #
11 | ---
12 | AccessModifierOffset: -4
13 | AlignAfterOpenBracket: Align
14 | AlignConsecutiveAssignments: false
15 | AlignConsecutiveDeclarations: false
16 | #AlignEscapedNewlines: Left # Unknown to clang-format-4.0
17 | AlignOperands: true
18 | AlignTrailingComments: false
19 | AllowAllParametersOfDeclarationOnNextLine: false
20 | AllowShortBlocksOnASingleLine: false
21 | AllowShortCaseLabelsOnASingleLine: false
22 | AllowShortFunctionsOnASingleLine: None
23 | AllowShortIfStatementsOnASingleLine: false
24 | AllowShortLoopsOnASingleLine: false
25 | AlwaysBreakAfterDefinitionReturnType: None
26 | AlwaysBreakAfterReturnType: None
27 | AlwaysBreakBeforeMultilineStrings: false
28 | AlwaysBreakTemplateDeclarations: false
29 | BinPackArguments: true
30 | BinPackParameters: true
31 | BraceWrapping:
32 | AfterClass: false
33 | AfterControlStatement: false
34 | AfterEnum: false
35 | AfterFunction: true
36 | AfterNamespace: true
37 | AfterObjCDeclaration: false
38 | AfterStruct: false
39 | AfterUnion: false
40 | #AfterExternBlock: false # Unknown to clang-format-5.0
41 | BeforeCatch: false
42 | BeforeElse: false
43 | IndentBraces: false
44 | #SplitEmptyFunction: true # Unknown to clang-format-4.0
45 | #SplitEmptyRecord: true # Unknown to clang-format-4.0
46 | #SplitEmptyNamespace: true # Unknown to clang-format-4.0
47 | BreakBeforeBinaryOperators: None
48 | BreakBeforeBraces: Custom
49 | #BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0
50 | BreakBeforeTernaryOperators: false
51 | BreakConstructorInitializersBeforeComma: false
52 | #BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0
53 | BreakAfterJavaFieldAnnotations: false
54 | BreakStringLiterals: false
55 | ColumnLimit: 88
56 | CommentPragmas: '^ IWYU pragma:'
57 | #CompactNamespaces: false # Unknown to clang-format-4.0
58 | ConstructorInitializerAllOnOneLineOrOnePerLine: false
59 | ConstructorInitializerIndentWidth: 4
60 | ContinuationIndentWidth: 2
61 | Cpp11BracedListStyle: true
62 | DerivePointerAlignment: false
63 | DisableFormat: false
64 | ExperimentalAutoDetectBinPacking: false
65 | #FixNamespaceComments: false # Unknown to clang-format-4.0
66 |
67 | # Taken from:
68 | # git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \
69 | # | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \
70 | # | sort | uniq
71 | ForEachMacros:
72 | - 'apei_estatus_for_each_section'
73 | - 'ata_for_each_dev'
74 | - 'ata_for_each_link'
75 | - 'ax25_for_each'
76 | - 'ax25_uid_for_each'
77 | - 'bio_for_each_integrity_vec'
78 | - '__bio_for_each_segment'
79 | - 'bio_for_each_segment'
80 | - 'bio_for_each_segment_all'
81 | - 'bio_list_for_each'
82 | - 'bip_for_each_vec'
83 | - 'blkg_for_each_descendant_post'
84 | - 'blkg_for_each_descendant_pre'
85 | - 'blk_queue_for_each_rl'
86 | - 'bond_for_each_slave'
87 | - 'bond_for_each_slave_rcu'
88 | - 'btree_for_each_safe128'
89 | - 'btree_for_each_safe32'
90 | - 'btree_for_each_safe64'
91 | - 'btree_for_each_safel'
92 | - 'card_for_each_dev'
93 | - 'cgroup_taskset_for_each'
94 | - 'cgroup_taskset_for_each_leader'
95 | - 'cpufreq_for_each_entry'
96 | - 'cpufreq_for_each_entry_idx'
97 | - 'cpufreq_for_each_valid_entry'
98 | - 'cpufreq_for_each_valid_entry_idx'
99 | - 'css_for_each_child'
100 | - 'css_for_each_descendant_post'
101 | - 'css_for_each_descendant_pre'
102 | - 'device_for_each_child_node'
103 | - 'drm_atomic_crtc_for_each_plane'
104 | - 'drm_atomic_crtc_state_for_each_plane'
105 | - 'drm_atomic_crtc_state_for_each_plane_state'
106 | - 'drm_for_each_connector_iter'
107 | - 'drm_for_each_crtc'
108 | - 'drm_for_each_encoder'
109 | - 'drm_for_each_encoder_mask'
110 | - 'drm_for_each_fb'
111 | - 'drm_for_each_legacy_plane'
112 | - 'drm_for_each_plane'
113 | - 'drm_for_each_plane_mask'
114 | - 'drm_mm_for_each_hole'
115 | - 'drm_mm_for_each_node'
116 | - 'drm_mm_for_each_node_in_range'
117 | - 'drm_mm_for_each_node_safe'
118 | - 'for_each_active_drhd_unit'
119 | - 'for_each_active_iommu'
120 | - 'for_each_available_child_of_node'
121 | - 'for_each_bio'
122 | - 'for_each_board_func_rsrc'
123 | - 'for_each_bvec'
124 | - 'for_each_child_of_node'
125 | - 'for_each_clear_bit'
126 | - 'for_each_clear_bit_from'
127 | - 'for_each_cmsghdr'
128 | - 'for_each_compatible_node'
129 | - 'for_each_console'
130 | - 'for_each_cpu'
131 | - 'for_each_cpu_and'
132 | - 'for_each_cpu_not'
133 | - 'for_each_cpu_wrap'
134 | - 'for_each_dev_addr'
135 | - 'for_each_dma_cap_mask'
136 | - 'for_each_drhd_unit'
137 | - 'for_each_dss_dev'
138 | - 'for_each_efi_memory_desc'
139 | - 'for_each_efi_memory_desc_in_map'
140 | - 'for_each_endpoint_of_node'
141 | - 'for_each_evictable_lru'
142 | - 'for_each_fib6_node_rt_rcu'
143 | - 'for_each_fib6_walker_rt'
144 | - 'for_each_free_mem_range'
145 | - 'for_each_free_mem_range_reverse'
146 | - 'for_each_func_rsrc'
147 | - 'for_each_hstate'
148 | - 'for_each_if'
149 | - 'for_each_iommu'
150 | - 'for_each_ip_tunnel_rcu'
151 | - 'for_each_irq_nr'
152 | - 'for_each_lru'
153 | - 'for_each_matching_node'
154 | - 'for_each_matching_node_and_match'
155 | - 'for_each_memblock'
156 | - 'for_each_memblock_type'
157 | - 'for_each_memcg_cache_index'
158 | - 'for_each_mem_pfn_range'
159 | - 'for_each_mem_range'
160 | - 'for_each_mem_range_rev'
161 | - 'for_each_migratetype_order'
162 | - 'for_each_msi_entry'
163 | - 'for_each_net'
164 | - 'for_each_netdev'
165 | - 'for_each_netdev_continue'
166 | - 'for_each_netdev_continue_rcu'
167 | - 'for_each_netdev_feature'
168 | - 'for_each_netdev_in_bond_rcu'
169 | - 'for_each_netdev_rcu'
170 | - 'for_each_netdev_reverse'
171 | - 'for_each_netdev_safe'
172 | - 'for_each_net_rcu'
173 | - 'for_each_new_connector_in_state'
174 | - 'for_each_new_crtc_in_state'
175 | - 'for_each_new_plane_in_state'
176 | - 'for_each_new_private_obj_in_state'
177 | - 'for_each_node'
178 | - 'for_each_node_by_name'
179 | - 'for_each_node_by_type'
180 | - 'for_each_node_mask'
181 | - 'for_each_node_state'
182 | - 'for_each_node_with_cpus'
183 | - 'for_each_node_with_property'
184 | - 'for_each_of_allnodes'
185 | - 'for_each_of_allnodes_from'
186 | - 'for_each_of_pci_range'
187 | - 'for_each_old_connector_in_state'
188 | - 'for_each_old_crtc_in_state'
189 | - 'for_each_oldnew_connector_in_state'
190 | - 'for_each_oldnew_crtc_in_state'
191 | - 'for_each_oldnew_plane_in_state'
192 | - 'for_each_oldnew_private_obj_in_state'
193 | - 'for_each_old_plane_in_state'
194 | - 'for_each_old_private_obj_in_state'
195 | - 'for_each_online_cpu'
196 | - 'for_each_online_node'
197 | - 'for_each_online_pgdat'
198 | - 'for_each_pci_bridge'
199 | - 'for_each_pci_dev'
200 | - 'for_each_pci_msi_entry'
201 | - 'for_each_populated_zone'
202 | - 'for_each_possible_cpu'
203 | - 'for_each_present_cpu'
204 | - 'for_each_prime_number'
205 | - 'for_each_prime_number_from'
206 | - 'for_each_process'
207 | - 'for_each_process_thread'
208 | - 'for_each_property_of_node'
209 | - 'for_each_reserved_mem_region'
210 | - 'for_each_resv_unavail_range'
211 | - 'for_each_rtdcom'
212 | - 'for_each_rtdcom_safe'
213 | - 'for_each_set_bit'
214 | - 'for_each_set_bit_from'
215 | - 'for_each_sg'
216 | - 'for_each_sg_page'
217 | - '__for_each_thread'
218 | - 'for_each_thread'
219 | - 'for_each_zone'
220 | - 'for_each_zone_zonelist'
221 | - 'for_each_zone_zonelist_nodemask'
222 | - 'fwnode_for_each_available_child_node'
223 | - 'fwnode_for_each_child_node'
224 | - 'fwnode_graph_for_each_endpoint'
225 | - 'gadget_for_each_ep'
226 | - 'hash_for_each'
227 | - 'hash_for_each_possible'
228 | - 'hash_for_each_possible_rcu'
229 | - 'hash_for_each_possible_rcu_notrace'
230 | - 'hash_for_each_possible_safe'
231 | - 'hash_for_each_rcu'
232 | - 'hash_for_each_safe'
233 | - 'hctx_for_each_ctx'
234 | - 'hlist_bl_for_each_entry'
235 | - 'hlist_bl_for_each_entry_rcu'
236 | - 'hlist_bl_for_each_entry_safe'
237 | - 'hlist_for_each'
238 | - 'hlist_for_each_entry'
239 | - 'hlist_for_each_entry_continue'
240 | - 'hlist_for_each_entry_continue_rcu'
241 | - 'hlist_for_each_entry_continue_rcu_bh'
242 | - 'hlist_for_each_entry_from'
243 | - 'hlist_for_each_entry_from_rcu'
244 | - 'hlist_for_each_entry_rcu'
245 | - 'hlist_for_each_entry_rcu_bh'
246 | - 'hlist_for_each_entry_rcu_notrace'
247 | - 'hlist_for_each_entry_safe'
248 | - '__hlist_for_each_rcu'
249 | - 'hlist_for_each_safe'
250 | - 'hlist_nulls_for_each_entry'
251 | - 'hlist_nulls_for_each_entry_from'
252 | - 'hlist_nulls_for_each_entry_rcu'
253 | - 'hlist_nulls_for_each_entry_safe'
254 | - 'ide_host_for_each_port'
255 | - 'ide_port_for_each_dev'
256 | - 'ide_port_for_each_present_dev'
257 | - 'idr_for_each_entry'
258 | - 'idr_for_each_entry_continue'
259 | - 'idr_for_each_entry_ul'
260 | - 'inet_bind_bucket_for_each'
261 | - 'inet_lhash2_for_each_icsk_rcu'
262 | - 'iov_for_each'
263 | - 'key_for_each'
264 | - 'key_for_each_safe'
265 | - 'klp_for_each_func'
266 | - 'klp_for_each_object'
267 | - 'kvm_for_each_memslot'
268 | - 'kvm_for_each_vcpu'
269 | - 'list_for_each'
270 | - 'list_for_each_entry'
271 | - 'list_for_each_entry_continue'
272 | - 'list_for_each_entry_continue_rcu'
273 | - 'list_for_each_entry_continue_reverse'
274 | - 'list_for_each_entry_from'
275 | - 'list_for_each_entry_from_reverse'
276 | - 'list_for_each_entry_lockless'
277 | - 'list_for_each_entry_rcu'
278 | - 'list_for_each_entry_reverse'
279 | - 'list_for_each_entry_safe'
280 | - 'list_for_each_entry_safe_continue'
281 | - 'list_for_each_entry_safe_from'
282 | - 'list_for_each_entry_safe_reverse'
283 | - 'list_for_each_prev'
284 | - 'list_for_each_prev_safe'
285 | - 'list_for_each_safe'
286 | - 'llist_for_each'
287 | - 'llist_for_each_entry'
288 | - 'llist_for_each_entry_safe'
289 | - 'llist_for_each_safe'
290 | - 'media_device_for_each_entity'
291 | - 'media_device_for_each_intf'
292 | - 'media_device_for_each_link'
293 | - 'media_device_for_each_pad'
294 | - 'netdev_for_each_lower_dev'
295 | - 'netdev_for_each_lower_private'
296 | - 'netdev_for_each_lower_private_rcu'
297 | - 'netdev_for_each_mc_addr'
298 | - 'netdev_for_each_uc_addr'
299 | - 'netdev_for_each_upper_dev_rcu'
300 | - 'netdev_hw_addr_list_for_each'
301 | - 'nft_rule_for_each_expr'
302 | - 'nla_for_each_attr'
303 | - 'nla_for_each_nested'
304 | - 'nlmsg_for_each_attr'
305 | - 'nlmsg_for_each_msg'
306 | - 'nr_neigh_for_each'
307 | - 'nr_neigh_for_each_safe'
308 | - 'nr_node_for_each'
309 | - 'nr_node_for_each_safe'
310 | - 'of_for_each_phandle'
311 | - 'of_property_for_each_string'
312 | - 'of_property_for_each_u32'
313 | - 'pci_bus_for_each_resource'
314 | - 'ping_portaddr_for_each_entry'
315 | - 'plist_for_each'
316 | - 'plist_for_each_continue'
317 | - 'plist_for_each_entry'
318 | - 'plist_for_each_entry_continue'
319 | - 'plist_for_each_entry_safe'
320 | - 'plist_for_each_safe'
321 | - 'pnp_for_each_card'
322 | - 'pnp_for_each_dev'
323 | - 'protocol_for_each_card'
324 | - 'protocol_for_each_dev'
325 | - 'queue_for_each_hw_ctx'
326 | - 'radix_tree_for_each_slot'
327 | - 'radix_tree_for_each_tagged'
328 | - 'rbtree_postorder_for_each_entry_safe'
329 | - 'resource_list_for_each_entry'
330 | - 'resource_list_for_each_entry_safe'
331 | - 'rhl_for_each_entry_rcu'
332 | - 'rhl_for_each_rcu'
333 | - 'rht_for_each'
334 | - 'rht_for_each_continue'
335 | - 'rht_for_each_entry'
336 | - 'rht_for_each_entry_continue'
337 | - 'rht_for_each_entry_rcu'
338 | - 'rht_for_each_entry_rcu_continue'
339 | - 'rht_for_each_entry_safe'
340 | - 'rht_for_each_rcu'
341 | - 'rht_for_each_rcu_continue'
342 | - '__rq_for_each_bio'
343 | - 'rq_for_each_segment'
344 | - 'scsi_for_each_prot_sg'
345 | - 'scsi_for_each_sg'
346 | - 'sctp_for_each_hentry'
347 | - 'sctp_skb_for_each'
348 | - 'shdma_for_each_chan'
349 | - '__shost_for_each_device'
350 | - 'shost_for_each_device'
351 | - 'sk_for_each'
352 | - 'sk_for_each_bound'
353 | - 'sk_for_each_entry_offset_rcu'
354 | - 'sk_for_each_from'
355 | - 'sk_for_each_rcu'
356 | - 'sk_for_each_safe'
357 | - 'sk_nulls_for_each'
358 | - 'sk_nulls_for_each_from'
359 | - 'sk_nulls_for_each_rcu'
360 | - 'snd_pcm_group_for_each_entry'
361 | - 'snd_soc_dapm_widget_for_each_path'
362 | - 'snd_soc_dapm_widget_for_each_path_safe'
363 | - 'snd_soc_dapm_widget_for_each_sink_path'
364 | - 'snd_soc_dapm_widget_for_each_source_path'
365 | - 'tb_property_for_each'
366 | - 'udp_portaddr_for_each_entry'
367 | - 'udp_portaddr_for_each_entry_rcu'
368 | - 'usb_hub_for_each_child'
369 | - 'v4l2_device_for_each_subdev'
370 | - 'v4l2_m2m_for_each_dst_buf'
371 | - 'v4l2_m2m_for_each_dst_buf_safe'
372 | - 'v4l2_m2m_for_each_src_buf'
373 | - 'v4l2_m2m_for_each_src_buf_safe'
374 | - 'zorro_for_each_dev'
375 |
376 | #IncludeBlocks: Preserve # Unknown to clang-format-5.0
377 | IncludeCategories:
378 | - Regex: '.*'
379 | Priority: 1
380 | IncludeIsMainRegex: '(Test)?$'
381 | IndentCaseLabels: false
382 | #IndentPPDirectives: None # Unknown to clang-format-5.0
383 | IndentWidth: 4
384 | IndentWrappedFunctionNames: false
385 | JavaScriptQuotes: Leave
386 | JavaScriptWrapImports: true
387 | KeepEmptyLinesAtTheStartOfBlocks: false
388 | MacroBlockBegin: ''
389 | MacroBlockEnd: ''
390 | MaxEmptyLinesToKeep: 1
391 | NamespaceIndentation: All
392 | #ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0
393 | ObjCBlockIndentWidth: 4
394 | ObjCSpaceAfterProperty: true
395 | ObjCSpaceBeforeProtocolList: true
396 |
397 | # Taken from git's rules
398 | #PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0
399 | PenaltyBreakBeforeFirstCallParameter: 30
400 | PenaltyBreakComment: 10
401 | PenaltyBreakFirstLessLess: 0
402 | PenaltyBreakString: 10
403 | PenaltyExcessCharacter: 100
404 | PenaltyReturnTypeOnItsOwnLine: 60
405 |
406 | PointerAlignment: Left
407 | ReflowComments: false
408 | SortIncludes: false
409 | #SortUsingDeclarations: false # Unknown to clang-format-4.0
410 | SpaceAfterCStyleCast: false
411 | SpaceAfterTemplateKeyword: true
412 | SpaceBeforeAssignmentOperators: true
413 | #SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0
414 | #SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0
415 | SpaceBeforeParens: ControlStatements
416 | #SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0
417 | SpaceInEmptyParentheses: false
418 | SpacesBeforeTrailingComments: 1
419 | SpacesInAngles: false
420 | SpacesInContainerLiterals: false
421 | SpacesInCStyleCastParentheses: false
422 | SpacesInParentheses: false
423 | SpacesInSquareBrackets: false
424 | Standard: Cpp11
425 | TabWidth: 4
426 | UseTab: Never
427 | ...
428 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is need for earlier builds of msysgit that does not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ## Ignore Visual Studio temporary files, build results, and
2 | ## files generated by popular Visual Studio add-ons.
3 |
4 | # User-specific files
5 | *.suo
6 | *.user
7 | *.userosscache
8 | *.sln.docstates
9 |
10 | # User-specific files (MonoDevelop/Xamarin Studio)
11 | *.userprefs
12 |
13 | # Build results
14 | [Dd]ebug/
15 | [Dd]ebugPublic/
16 | [Rr]elease/
17 | [Rr]eleases/
18 | x64/
19 | x86/
20 | bld/
21 | [Bb]in/
22 | [Oo]bj/
23 | [Ll]og/
24 |
25 | # Visual Studio 2015 cache/options directory
26 | .vs/
27 | # Uncomment if you have tasks that create the project's static files in wwwroot
28 | #wwwroot/
29 |
30 | # MSTest test Results
31 | [Tt]est[Rr]esult*/
32 | [Bb]uild[Ll]og.*
33 |
34 | # NUNIT
35 | *.VisualState.xml
36 | TestResult.xml
37 |
38 | # Build Results of an ATL Project
39 | [Dd]ebugPS/
40 | [Rr]eleasePS/
41 | dlldata.c
42 |
43 | # DNX
44 | project.lock.json
45 | project.fragment.lock.json
46 | artifacts/
47 |
48 | *_i.c
49 | *_p.c
50 | *_i.h
51 | *.ilk
52 | *.meta
53 | *.obj
54 | *.pch
55 | *.pdb
56 | *.pgc
57 | *.pgd
58 | *.rsp
59 | *.sbr
60 | *.tlb
61 | *.tli
62 | *.tlh
63 | *.tmp
64 | *.tmp_proj
65 | *.log
66 | *.vspscc
67 | *.vssscc
68 | .builds
69 | *.pidb
70 | *.svclog
71 | *.scc
72 |
73 | # Chutzpah Test files
74 | _Chutzpah*
75 |
76 | # Visual C++ cache files
77 | ipch/
78 | *.aps
79 | *.ncb
80 | *.opendb
81 | *.opensdf
82 | *.sdf
83 | *.cachefile
84 | *.VC.db
85 | *.VC.VC.opendb
86 |
87 | # Visual Studio profiler
88 | *.psess
89 | *.vsp
90 | *.vspx
91 | *.sap
92 |
93 | # TFS 2012 Local Workspace
94 | $tf/
95 |
96 | # Guidance Automation Toolkit
97 | *.gpState
98 |
99 | # ReSharper is a .NET coding add-in
100 | _ReSharper*/
101 | *.[Rr]e[Ss]harper
102 | *.DotSettings.user
103 |
104 | # JustCode is a .NET coding add-in
105 | .JustCode
106 |
107 | # TeamCity is a build add-in
108 | _TeamCity*
109 |
110 | # DotCover is a Code Coverage Tool
111 | *.dotCover
112 |
113 | # NCrunch
114 | _NCrunch_*
115 | .*crunch*.local.xml
116 | nCrunchTemp_*
117 |
118 | # MightyMoose
119 | *.mm.*
120 | AutoTest.Net/
121 |
122 | # Web workbench (sass)
123 | .sass-cache/
124 |
125 | # Installshield output folder
126 | [Ee]xpress/
127 |
128 | # DocProject is a documentation generator add-in
129 | DocProject/buildhelp/
130 | DocProject/Help/*.HxT
131 | DocProject/Help/*.HxC
132 | DocProject/Help/*.hhc
133 | DocProject/Help/*.hhk
134 | DocProject/Help/*.hhp
135 | DocProject/Help/Html2
136 | DocProject/Help/html
137 |
138 | # Click-Once directory
139 | publish/
140 |
141 | # Publish Web Output
142 | *.[Pp]ublish.xml
143 | *.azurePubxml
144 | # TODO: Comment the next line if you want to checkin your web deploy settings
145 | # but database connection strings (with potential passwords) will be unencrypted
146 | #*.pubxml
147 | *.publishproj
148 |
149 | # Microsoft Azure Web App publish settings. Comment the next line if you want to
150 | # checkin your Azure Web App publish settings, but sensitive information contained
151 | # in these scripts will be unencrypted
152 | PublishScripts/
153 |
154 | # NuGet Packages
155 | *.nupkg
156 | # The packages folder can be ignored because of Package Restore
157 | **/packages/*
158 | # except build/, which is used as an MSBuild target.
159 | !**/packages/build/
160 | # Uncomment if necessary however generally it will be regenerated when needed
161 | #!**/packages/repositories.config
162 | # NuGet v3's project.json files produces more ignoreable files
163 | *.nuget.props
164 | *.nuget.targets
165 |
166 | # Microsoft Azure Build Output
167 | csx/
168 | *.build.csdef
169 |
170 | # Microsoft Azure Emulator
171 | ecf/
172 | rcf/
173 |
174 | # Windows Store app package directories and files
175 | AppPackages/
176 | BundleArtifacts/
177 | Package.StoreAssociation.xml
178 | _pkginfo.txt
179 |
180 | # Visual Studio cache files
181 | # files ending in .cache can be ignored
182 | *.[Cc]ache
183 | # but keep track of directories ending in .cache
184 | !*.[Cc]ache/
185 |
186 | # Others
187 | ClientBin/
188 | ~$*
189 | *~
190 | *.dbmdl
191 | *.dbproj.schemaview
192 | *.jfm
193 | *.pfx
194 | *.publishsettings
195 | node_modules/
196 | orleans.codegen.cs
197 |
198 | # Since there are multiple workflows, uncomment next line to ignore bower_components
199 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
200 | #bower_components/
201 |
202 | # RIA/Silverlight projects
203 | Generated_Code/
204 |
205 | # Backup & report files from converting an old project file
206 | # to a newer Visual Studio version. Backup files are not needed,
207 | # because we have git ;-)
208 | _UpgradeReport_Files/
209 | Backup*/
210 | UpgradeLog*.XML
211 | UpgradeLog*.htm
212 |
213 | # SQL Server files
214 | *.mdf
215 | *.ldf
216 |
217 | # Business Intelligence projects
218 | *.rdl.data
219 | *.bim.layout
220 | *.bim_*.settings
221 |
222 | # Microsoft Fakes
223 | FakesAssemblies/
224 |
225 | # GhostDoc plugin setting file
226 | *.GhostDoc.xml
227 |
228 | # Node.js Tools for Visual Studio
229 | .ntvs_analysis.dat
230 |
231 | # Visual Studio 6 build log
232 | *.plg
233 |
234 | # Visual Studio 6 workspace options file
235 | *.opt
236 |
237 | # Visual Studio LightSwitch build output
238 | **/*.HTMLClient/GeneratedArtifacts
239 | **/*.DesktopClient/GeneratedArtifacts
240 | **/*.DesktopClient/ModelManifest.xml
241 | **/*.Server/GeneratedArtifacts
242 | **/*.Server/ModelManifest.xml
243 | _Pvt_Extensions
244 |
245 | # Paket dependency manager
246 | .paket/paket.exe
247 | paket-files/
248 |
249 | # FAKE - F# Make
250 | .fake/
251 |
252 | # JetBrains Rider
253 | .idea/
254 | *.sln.iml
255 |
256 | # CodeRush
257 | .cr/
258 |
259 | # Python Tools for Visual Studio (PTVS)
260 | __pycache__/
261 | *.pyc
262 | /MatrixMult/matrixB.bin
263 | /MatrixMult/matrixA.bin
264 | /matrixB.bin
265 | /matrixAB.bin
266 | /matrixA.bin
267 | /MatrixMult/matrixAB.bin
268 | /MatrixMult/matrixAB-out.bin
269 | /Benchmarks/My Inspector Results - Benchmarks/My Inspector Results - Benchmarks.inspxeproj
270 | /matrixAB-out.bin
271 | /MatrixMult/My Advisor Results - MatrixMult
272 | /MatrixMult/My Amplifier Results - MatrixMult
273 | /Benchmarks/matrixB.bin
274 | /Benchmarks/matrixAB.bin
275 | /Benchmarks/matrixA.bin
276 | /Benchmarks/My Advisor Results - Benchmarks
277 | /Benchmarks/My Amplifier Results - Benchmarks
278 | /MatrixGenerator/My Amplifier Results - MatrixGenerator
279 | /MatrixMult/matrixB11000.bin
280 | /MatrixMult/matrixAB11000.bin
281 | /MatrixMult/matrixA11000.bin
282 | /MatrixMult/matrixB9000.bin
283 | /MatrixMult/matrixA9000.bin
284 | /MatrixMult/My Inspector Results - MatrixMult
285 | /MatrixMult/matrixB1000.bin
286 | /MatrixMult/matrixA1000.bin
287 | /MatrixMult/matrixBx.bin
288 | /MatrixMult/matrixAx.bin
289 | /MatrixMult/matrixABx.bin
290 |
--------------------------------------------------------------------------------
/Benchmark1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/talhasaruhan/cpp-matmul/e1ef1edf935d5af6d79de15b127d1e8ad13f284c/Benchmark1.png
--------------------------------------------------------------------------------
/Benchmarks/Benchmarks.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 | 15.0
23 | {5895928A-FD77-4426-9588-36399A75D082}
24 | Benchmarks
25 | 10.0.16299.0
26 |
27 |
28 |
29 | Application
30 | true
31 | v141
32 | MultiByte
33 |
34 |
35 | Application
36 | false
37 | v141
38 | true
39 | MultiByte
40 |
41 |
42 | Application
43 | true
44 | v141
45 | MultiByte
46 |
47 |
48 | Application
49 | false
50 | v141
51 | true
52 | MultiByte
53 | true
54 | Parallel
55 | true
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 | $(ExecutablePath)
77 | $(SourcePath)
78 |
79 |
80 |
81 | Level3
82 | MaxSpeed
83 | true
84 | true
85 | true
86 | true
87 | C:\eigen;
88 | Speed
89 | true
90 | AdvancedVectorExtensions2
91 | Fast
92 | true
93 | /DMKL_ILP64 -I"%MKLROOT%"\include %(AdditionalOptions)
94 | MultiThreaded
95 | true
96 | true
97 | true
98 | No
99 | false
100 | false
101 |
102 |
103 | true
104 | true
105 | mkl_intel_ilp64.lib; mkl_tbb_thread.lib; mkl_core.lib; tbb.lib
106 | C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2019.0.117\windows\mkl\lib\intel64_win;C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2019\windows\tbb\lib\intel64_win\vc14_uwp
107 | /DMKL_ILP64 -I"%MKLROOT%"\include %(AdditionalOptions)
108 | Console
109 |
110 |
111 |
112 |
113 | Level3
114 | Disabled
115 | true
116 | true
117 |
118 |
119 |
120 |
121 | Level3
122 | Disabled
123 | true
124 | true
125 |
126 |
127 |
128 |
129 | Level3
130 | MaxSpeed
131 | true
132 | true
133 | true
134 | true
135 |
136 |
137 | true
138 | true
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
--------------------------------------------------------------------------------
/Benchmarks/Benchmarks.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 | Source Files
23 |
24 |
25 | Source Files
26 |
27 |
28 |
--------------------------------------------------------------------------------
/Benchmarks/EigenBenchmark.cpp:
--------------------------------------------------------------------------------
1 | /* Angle-bracketed header names were lost in this dump;
2 |  * the includes below are reconstructed from what the code uses. */
3 | #include <cassert>
4 | #include <chrono>
5 | #include <cstdlib>
6 | #include <iostream>
7 | #include <mkl.h>
8 | #define EIGEN_USE_MKL_ALL
9 | #include <Eigen/Dense>
10 |
11 | using namespace std;
12 | using namespace Eigen;
13 |
14 | int main(int argc, char* argv[])
15 | {
16 |     int K;
17 |     if (argc == 1) {
18 |         K = 10000;
19 |     } else {
20 |         /* 2 NxN; take the size from the command line (K was left uninitialized for argc > 2) */
21 |         K = atoi(argv[1]);
22 |         assert(K > 0);
23 |     }
24 |
25 | mkl_set_num_threads(12);
26 | setNbThreads(12);
27 |
28 | MatrixXd matA = MatrixXd::Random(K, K);
29 | MatrixXd matB = MatrixXd::Random(K, K);
30 |
31 | auto start = std::chrono::high_resolution_clock::now();
32 | MatrixXd matC = matA * matB;
33 | auto end = std::chrono::high_resolution_clock::now();
34 |
35 |     std::cout
36 |         << "Matrix Multiplication: "
37 |         << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
38 |         << " microseconds.\n";
39 | }
40 |
--------------------------------------------------------------------------------
/Benchmarks/IntrinASMDotBenchmark.cpp:
--------------------------------------------------------------------------------
1 | //#include
2 | //#include
3 | //#include
4 | //#include
5 | //#include
6 | //#include
7 | //#include
8 | //#include
9 | //#include
10 | //#include
11 | //#include
12 | //
13 | //using namespace std;
14 | //
15 | //#define AVX_ALIGNMENT 32
16 | //
17 | //float VecDotIntrinsicExplicit1(float* const a, float* const b, const unsigned N)
18 | //{
19 | // float* vsum = (float*)aligned_alloc(8 * sizeof(float), AVX_ALIGNMENT);
20 | // for (int i = 0; i<8; ++i) vsum[i] = 0;
21 | //
22 | // __m256 sum = _mm256_setzero_ps();
23 | // __m256 a1, a2, a3, a4, a5, a6, a7, a8;
24 | // __m256 b1, b2, b3, b4, b5, b6, b7, b8;
25 | //
26 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
327 | //
328 | // /*****************************************************/
329 | //
330 | // cout << t1 << endl;
331 | //}
332 | //
333 | ////int main() {
334 | //// ILPSum();
335 | ////}
--------------------------------------------------------------------------------
/Benchmarks/IntrinsicSumBenchmarks.cpp:
--------------------------------------------------------------------------------
1 | //#include
2 | //#include
3 | //#include
4 | //#include
5 | //#include
6 | //#include
7 | //#include
8 | //
9 | //using namespace std;
10 | //
11 | //#define AVX_ALIGNMENT 32
12 | //
13 | ///* naive sum using intrinsics */
14 | //float VecSumIntrinsicNaiveLoop(const float* const __restrict c, const unsigned N)
15 | //{
16 | // _declspec(align(32)) float vsum[8];
17 | // for (int i = 0; i<8; ++i) vsum[i] = 0;
18 | //
19 | // __m256 sum = _mm256_setzero_ps();
20 | // __m256 x0, x1;
21 | //
22 | // for (int i = 0; i> 1;
282 | // _mm256_store_ps(&c[j + 0], c1);
283 | // _mm256_store_ps(&c[j + 8], c2);
284 | // _mm256_store_ps(&c[j + 16], c3);
285 | // _mm256_store_ps(&c[j + 24], c4);
286 | // _mm256_store_ps(&c[j + 32], c5);
287 | // _mm256_store_ps(&c[j + 40], c6);
288 | // _mm256_store_ps(&c[j + 48], c7);
289 | // _mm256_store_ps(&c[j + 56], c8);
290 | // }
291 | // }
292 | //
293 | // return VecSumIntrinsicNaiveLoop(c, 64);
294 | //}
295 | //
296 | ///* scalar sum */
297 | //float VecSumScalarAccumulate(const float* const __restrict c, const unsigned N) {
298 | // /*
299 | // * compiler optimizes this by keeping t in an xmm register
300 | // * s.t at every iteration, we do 1 load and 1 add
301 | // * but t <- add(t, ai) is obviously dependent on t
302 | // * so there goes the ILP.
303 | // */
304 | //
305 | // float t = 0;
306 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
350 | //
351 | // /*****************************************************/
352 | //
353 | // //memcpy(ar_cpy, ar, N * sizeof(float));
354 | //
355 | // //start = std::chrono::high_resolution_clock::now();
356 | // //t2 = VecSumScalarBinary(ar_cpy, N, K);
357 | // //end = std::chrono::high_resolution_clock::now();
358 | // //std::cout << "C++ Binary sum: " << std::chrono::duration_cast(end - start).count() << " milliseconds.\n";
359 | //
360 | // /*****************************************************/
361 | //
362 | // start = std::chrono::high_resolution_clock::now();
363 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
367 | //
368 | // /*****************************************************/
369 | //
370 | // start = std::chrono::high_resolution_clock::now();
371 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
375 | //
376 | // /*****************************************************/
377 | //
378 | // start = std::chrono::high_resolution_clock::now();
379 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
383 | //
384 | // /*****************************************************/
385 | //
386 | // start = std::chrono::high_resolution_clock::now();
387 | // for (int i = 0; i(end - start).count() << " milliseconds.\n";
391 | //
392 | // /*****************************************************/
393 | //
394 | // cout << t1 << endl;
395 | // cout << t3 << endl;
396 | // cout << t4 << endl;
397 | // cout << t5 << endl;
398 | // cout << t6 << endl;
399 | //}
400 | //
401 | //int main() {
402 | // ILPSum();
403 | //}
--------------------------------------------------------------------------------
/Benchmarks/NumpyBenchmark.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 |
4 | n = 1000
5 | a = np.random.randn(n, n)*50
6 |
7 | start = time.time()
8 | b = np.dot(a, a)
9 | end = time.time()
10 |
11 | print(end-start)
--------------------------------------------------------------------------------
/MatrixGenerator/MatrixGenerator.cpp:
--------------------------------------------------------------------------------
1 | #define WIN32_LEAN_AND_MEAN
2 | /* Angle-bracketed header names were lost in this dump;
3 |  * the includes below are reconstructed from what the code uses. */
4 | #include <Windows.h>
5 | #include <cassert>
6 | #include <chrono>
7 | #include <cstdint>
8 | #include <cstdio>
9 | #include <cstdlib>
10 | #include <fstream>
11 | #include <functional>
12 | #include <iostream>
13 | #include <malloc.h>
14 | #include <random>
15 | #include <string>
22 |
23 | #define AVX_ALIGN 32
24 |
25 | typedef struct Mat
26 | {
27 | unsigned width;
28 | unsigned height;
29 | unsigned rowSpan;
30 | float *mat;
31 | } Mat;
32 |
33 | template <typename Rand>
34 | static void RandInitMat(Mat *m, Rand &r)
35 | {
36 |     for (unsigned y = 0; y < m->height; ++y)
37 |         for (unsigned x = 0; x < m->width; ++x)
38 | m->mat[y*m->rowSpan + x] = r();
39 | }
40 |
41 | const Mat LoadMat(const char * const filename) {
42 | Mat mat;
43 | uint32_t matSize;
44 |
45 | std::ifstream in(filename, std::ios::binary | std::ios::in);
46 |
47 | if (!in.is_open()) {
48 | std::cerr << "Err loading!\n";
49 | return {};
50 | }
51 |
52 | in.read((char*)&mat, 3 * sizeof(uint32_t));
53 | in.read((char*)&matSize, sizeof(uint32_t));
54 | in.seekg(12*sizeof(uint32_t), std::ios::cur);
55 | mat.mat = (float*)malloc(matSize);
56 | in.read((char*)mat.mat, matSize);
57 |
58 | in.close();
59 |
60 | return mat;
61 | }
62 |
63 | static void DumpMat(const char *filename, const Mat &m)
64 | {
65 | uint32_t header[16];
66 | std::ofstream out(filename, std::ofstream::binary | std::ofstream::out);
67 |
68 | header[0] = m.width;
69 | header[1] = m.height;
70 | header[2] = m.rowSpan;
71 | header[3] = m.height * m.rowSpan * sizeof(float);
72 |
73 |     out.write(reinterpret_cast<const char*>(header), sizeof(header));
74 |     out.write(reinterpret_cast<const char*>(m.mat), header[3]);
75 |
76 | out.close();
77 | }
78 |
79 | static unsigned RoundUpPwr2(unsigned val, unsigned pwr2)
80 | {
81 | return (val + (pwr2 - 1)) & (~(pwr2 - 1));
82 | }
83 |
84 | /* This function prints the given matrix to given std::ostream */
85 | static void PrintMat(const Mat& mat, std::ostream& stream)
86 | {
87 | stream << "w, h, rS: " << mat.width << " " << mat.height << " " << mat.rowSpan
88 | << "\n";
89 | for (int i = 0; i < mat.height; i++) {
90 | for (int j = 0; j < mat.width; ++j) {
91 | stream << mat.mat[i * mat.rowSpan + j] << " ";
92 | }
93 | stream << "\n";
94 | }
95 | }
96 |
97 |
98 | /* Single threaded, do i need to multithread this as well?
99 | Honestly, I don't think it will have any significant effect. n^2 vs n^3 */
100 | __declspec(noalias) const Mat TransposeMat(const Mat& mat)
101 | {
102 | const unsigned tRowSpan = RoundUpPwr2(mat.height, 64 / sizeof(float));
103 | float* __restrict const tData =
104 | (float*)_aligned_malloc(mat.width * tRowSpan * sizeof(float), AVX_ALIGN);
105 |
106 | Mat T{ mat.height, mat.width, tRowSpan, tData };
107 |
108 |     // hah, the loops are truly interchangeable, as we encounter a cache miss either way
109 | for (int rowT = 0; rowT < T.height; ++rowT) {
110 | for (int colT = 0; colT < T.width; ++colT) {
111 | tData[rowT * tRowSpan + colT] = mat.mat[colT * mat.rowSpan + rowT];
112 | }
113 | }
114 |
115 | return T;
116 | }
117 |
118 | const Mat ST_TransposedBMatMul(const Mat& matA, const Mat& matB)
119 | {
120 | /* Now, I thought transposing B and then traversing it row order would help and it does!
121 | * Also, note that, if we manually unrolled the loop here, compiler wouldn't vectorize the loop for some reason
122 | * (1301: Loop stride is not +1.) is the exact compiler message. */
123 | float* __restrict const matData =
124 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);
125 |
126 | Mat matC{ matB.width, matA.height, matB.rowSpan, matData };
127 |
128 | const Mat matBT = TransposeMat(matB);
129 | for (int rowC = 0; rowC < matA.height; ++rowC) {
130 | for (int colC = 0; colC < matB.width; ++colC) {
131 | float accumulate = 0;
132 | for (int pos = 0; pos < matA.width; ++pos) {
133 | accumulate += matA.mat[rowC * matA.rowSpan + pos] *
134 | matBT.mat[colC * matBT.rowSpan + pos];
135 | }
136 | matData[rowC * matB.rowSpan + colC] = accumulate;
137 | }
138 | }
139 |
140 | _aligned_free(matBT.mat);
141 |
142 | return matC;
143 | }
144 |
145 | int _cdecl main(int argc, char *argv[])
146 | {
147 | static const unsigned ALIGN = 64;
148 | static const unsigned FLT_ALIGN = ALIGN / sizeof(float);
149 |
150 | std::random_device rd;
151 |     std::uniform_real_distribution<float> matValDist(-50.0f, 50.0f);
152 | auto matRand = std::bind(matValDist, std::ref(rd));
153 | Mat a, b;
154 | std::string suffix;
155 |
156 | if (argc == 1) {
157 | /* randomly generated */
158 |         std::uniform_int_distribution<unsigned> matSizeDist(100, 1000);
159 | auto sizeRand = std::bind(matSizeDist, std::ref(rd));
160 | a.width = sizeRand();
161 | a.height = sizeRand();
162 | a.rowSpan = RoundUpPwr2(a.width, FLT_ALIGN);
163 |
164 | b.width = sizeRand();
165 | b.height = a.width;
166 |
167 | suffix = "";
168 | }
169 | else if (argc == 2) {
170 | /* 2 NxN */
171 | const int N = atoi(argv[1]);
172 | assert(N > 0);
173 | a.width = N;
174 | a.height = N;
175 | b.width = N;
176 | b.height = N;
177 |
178 | suffix = "";
179 | }
180 | else if (argc == 3) {
181 | /* 2 NxN */
182 | const int N = atoi(argv[1]);
183 | assert(N > 0);
184 | a.width = N;
185 | a.height = N;
186 | b.width = N;
187 |         b.height = N;
188 |
189 | suffix = std::string(argv[2]);
190 | }
191 | else if (argc == 4) {
192 | /* NxM, MxN */
193 | const int N = atoi(argv[1]);
194 | const int M = atoi(argv[2]);
195 | assert(N > 0 && M > 0);
196 | a.width = M;
197 | a.height = N;
198 | b.width = N;
199 | b.height = M;
200 |
201 | suffix = std::string(argv[3]);
202 | }
203 | else if (argc == 5) {
204 | /* NxM, MxK */
205 | const int N = atoi(argv[1]);
206 | const int M = atoi(argv[2]);
207 | const int K = atoi(argv[3]);
208 | assert(N > 0 && M > 0);
209 | a.width = M;
210 | a.height = N;
211 | b.width = K;
212 | b.height = M;
213 |
214 | suffix = std::string(argv[4]);
215 | }
216 | else {
217 | std::cerr << "Invalid arguments!\n";
218 | return 2;
219 | }
220 |
221 |
222 | a.rowSpan = RoundUpPwr2(a.width, FLT_ALIGN);
223 | b.rowSpan = RoundUpPwr2(b.width, FLT_ALIGN);
224 |
225 | a.mat = new float[a.rowSpan*a.height];
226 | b.mat = new float[b.rowSpan*b.height];
227 |
228 | RandInitMat(&a, matRand);
229 | RandInitMat(&b, matRand);
230 |
231 | printf("a: [%d %d] | b: [%d %d]\n", a.width, a.height, b.width, b.height);
232 |
233 | auto start = std::chrono::high_resolution_clock::now();
234 | const Mat c = ST_TransposedBMatMul(a, b);
235 | auto end = std::chrono::high_resolution_clock::now();
236 |     std::cout << "Generation w/ transposed mult. took: "
237 |               << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
238 | << " microseconds.\n";
239 |
240 | DumpMat(("matrixA" + suffix + ".bin").c_str(), a);
241 | DumpMat(("matrixB" + suffix + ".bin").c_str(), b);
242 | DumpMat(("matrixAB" + suffix + ".bin").c_str(), c);
243 |
244 | delete[] a.mat;
245 | delete[] b.mat;
246 | _aligned_free(c.mat);
247 |
248 | return 0;
249 | }
250 |
--------------------------------------------------------------------------------
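
DumpMat and LoadMat above fix a simple on-disk layout: a 64-byte header of sixteen
uint32 values, of which only the first four are meaningful (width, height, rowSpan,
and the size of the float payload in bytes; the remaining twelve words are padding,
which is why LoadMat seeks past 12 * sizeof(uint32_t) after reading the first four),
followed by the raw row-major float data with rowSpan-padded rows. A minimal
standalone reader sketch of this format (the MatFile/ReadMatFile names are
hypothetical, error handling kept short):

    #include <cstdint>
    #include <fstream>
    #include <vector>

    struct MatFile {
        uint32_t width, height, rowSpan, dataBytes;
        std::vector<float> data;                 // rowSpan * height floats
    };

    static bool ReadMatFile(const char* path, MatFile& m)
    {
        std::ifstream in(path, std::ios::binary);
        uint32_t header[16];                     // 64-byte header, fields 0..3 used
        if (!in.read(reinterpret_cast<char*>(header), sizeof(header)))
            return false;
        m.width     = header[0];
        m.height    = header[1];
        m.rowSpan   = header[2];
        m.dataBytes = header[3];                 // height * rowSpan * sizeof(float)
        m.data.resize(m.dataBytes / sizeof(float));
        return static_cast<bool>(
            in.read(reinterpret_cast<char*>(m.data.data()), m.dataBytes));
    }
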
/MatrixGenerator/MatrixGenerator.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 |
23 |
24 |
25 | 15.0
26 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}
27 | MatrixGenerator
28 | 10.0.16299.0
29 |
30 |
31 |
32 | Application
33 | true
34 | v141
35 | MultiByte
36 |
37 |
38 | Application
39 | false
40 | v141
41 | true
42 | MultiByte
43 |
44 |
45 | Application
46 | true
47 | v141
48 | MultiByte
49 |
50 |
51 | Application
52 | false
53 | v141
54 | true
55 | MultiByte
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 | Level3
79 | Disabled
80 | true
81 | true
82 |
83 |
84 | Console
85 |
86 |
87 |
88 |
89 | Level3
90 | Disabled
91 | true
92 | true
93 |
94 |
95 |
96 |
97 | Level3
98 | MaxSpeed
99 | true
100 | true
101 | true
102 | true
103 |
104 |
105 | true
106 | true
107 |
108 |
109 |
110 |
111 | Level3
112 | MaxSpeed
113 | true
114 | true
115 | true
116 | true
117 | Speed
118 | AdvancedVectorExtensions2
119 | Fast
120 | false
121 | false
122 | true
123 | false
124 | false
125 |
126 |
127 | true
128 | true
129 |
130 |
131 |
132 |
133 |
134 |
--------------------------------------------------------------------------------
/MatrixGenerator/MatrixGenerator.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 |
--------------------------------------------------------------------------------
/MatrixMult.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.27428.2015
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixMult", "MatrixMult\MatrixMult.vcxproj", "{54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}"
7 | EndProject
8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixMulTester", "MatrixMulTester\MatrixMulTester.vcxproj", "{0417B0D4-F0BF-4218-945C-C139C9498728}"
9 | EndProject
10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixGenerator", "MatrixGenerator\MatrixGenerator.vcxproj", "{C6A23610-8F92-418E-8BC6-2CEFA194CE78}"
11 | EndProject
12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Benchmarks", "Benchmarks\Benchmarks.vcxproj", "{5895928A-FD77-4426-9588-36399A75D082}"
13 | EndProject
14 | Global
15 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
16 | Debug|x64 = Debug|x64
17 | Debug|x86 = Debug|x86
18 | Release|x64 = Release|x64
19 | Release|x86 = Release|x86
20 | EndGlobalSection
21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
22 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x64.ActiveCfg = Debug|x64
23 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x64.Build.0 = Debug|x64
24 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x86.ActiveCfg = Debug|Win32
25 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x86.Build.0 = Debug|Win32
26 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x64.ActiveCfg = Release|x64
27 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x64.Build.0 = Release|x64
28 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x86.ActiveCfg = Release|Win32
29 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x86.Build.0 = Release|Win32
30 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x64.ActiveCfg = Debug|x64
31 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x64.Build.0 = Debug|x64
32 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x86.ActiveCfg = Debug|Win32
33 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x86.Build.0 = Debug|Win32
34 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x64.ActiveCfg = Release|x64
35 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x64.Build.0 = Release|x64
36 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x86.ActiveCfg = Release|Win32
37 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x86.Build.0 = Release|Win32
38 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x64.ActiveCfg = Debug|x64
39 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x64.Build.0 = Debug|x64
40 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x86.ActiveCfg = Debug|Win32
41 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x86.Build.0 = Debug|Win32
42 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x64.ActiveCfg = Release|x64
43 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x64.Build.0 = Release|x64
44 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x86.ActiveCfg = Release|Win32
45 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x86.Build.0 = Release|Win32
46 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x64.ActiveCfg = Debug|x64
47 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x64.Build.0 = Debug|x64
48 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x86.ActiveCfg = Debug|Win32
49 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x86.Build.0 = Debug|Win32
50 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x64.ActiveCfg = Release|x64
51 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x64.Build.0 = Release|x64
52 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x86.ActiveCfg = Release|Win32
53 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x86.Build.0 = Release|Win32
54 | EndGlobalSection
55 | GlobalSection(SolutionProperties) = preSolution
56 | HideSolutionNode = FALSE
57 | EndGlobalSection
58 | GlobalSection(ExtensibilityGlobals) = postSolution
59 | SolutionGuid = {D568E00C-A8ED-41CB-B719-B116D29D421F}
60 | EndGlobalSection
61 | GlobalSection(Performance) = preSolution
62 | HasPerformanceSessions = true
63 | EndGlobalSection
64 | EndGlobal
65 |
--------------------------------------------------------------------------------
/MatrixMult/CPUUtil.cpp:
--------------------------------------------------------------------------------
1 | #include "CPUUtil.h"
2 | /* Header names were lost in this dump; reconstructed from what the code uses: */
3 | #include <cstdint>
4 | #include <cstdio>
5 | #include <cstdlib>
6 | #include <cstring>
7 | #include <intrin.h>
8 |
5 | namespace CPUUtil
6 | {
7 | namespace
8 | {
9 | static int logicalProcInfoCached = 0;
10 | static unsigned numHWCores, numLogicalProcessors;
11 | static ULONG_PTR* physLogicalProcessorMap = NULL;
12 |
13 | void PrintSysLPInfoArr(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION* const sysLPInf,
14 | const DWORD& retLen)
15 | {
16 | unsigned numPhysicalCores = 0;
17 |         for (int i = 0; i * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION) < retLen;
18 | ++i) {
19 | if (sysLPInf[i].Relationship != RelationProcessorCore)
20 | continue;
21 |
22 | printf(
23 | "PHYSICAL CPU[%d]\n\t_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX:\n",
24 | numPhysicalCores);
25 | printf("\t\tProcessorMask:%s\n",
26 | BitmaskToStr(sysLPInf[i].ProcessorMask));
27 | printf("\t\tRelationship:%u | RelationProcessorCore\n",
28 | (uint8_t)sysLPInf[i].Relationship);
29 | printf("\t\tProcessorCore:\n");
30 | printf("\t\t\tFlags(HT?):%d\n",
31 | (uint8_t)sysLPInf[i].ProcessorCore.Flags);
32 | ++numPhysicalCores;
33 | }
34 | }
35 |
36 | int TestPrintCPUCores()
37 | {
38 | const unsigned N = 30;
39 | _SYSTEM_LOGICAL_PROCESSOR_INFORMATION sysLPInf[N];
40 | DWORD retLen = N * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
41 | LOGICAL_PROCESSOR_RELATIONSHIP lpRel = RelationProcessorCore;
42 |
43 | BOOL retCode = GetLogicalProcessorInformation(&sysLPInf[0], &retLen);
44 |
45 | if (!retCode) {
46 | DWORD errCode = GetLastError();
47 | printf("ERR: %d\n", errCode);
48 | if (errCode == ERROR_INSUFFICIENT_BUFFER) {
49 | printf("Buffer is not large enough! Buffer length required: %d\n",
50 | retLen);
51 | } else {
52 | printf("CHECK MSDN SYSTEM ERROR CODES LIST.\n");
53 | }
54 | return errCode;
55 | }
56 |
57 | PrintSysLPInfoArr(sysLPInf, retLen);
58 |
59 | return 0;
60 | }
61 |
62 |     template <typename T>
63 | int NumSetBits(T n) {
64 | int count = 0;
65 | while (n) {
66 | count += (n & 1) > 0 ? 1 : 0;
67 | n >>= 1;
68 | }
69 | return count;
70 | }
71 |
72 | DWORD _GetSysLPMap(unsigned& numHWCores)
73 | {
74 | // These assumptions should never fail on desktop
75 | const unsigned N = 48, M = 48;
76 |
77 | _SYSTEM_LOGICAL_PROCESSOR_INFORMATION sysLPInf[N];
78 | DWORD retLen = N * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
79 | LOGICAL_PROCESSOR_RELATIONSHIP lpRel = RelationProcessorCore;
80 |
81 |         BOOL retCode = GetLogicalProcessorInformation(&sysLPInf[0], &retLen);
82 |
83 | if (!retCode) {
84 | return GetLastError();
85 | }
86 |
87 | ULONG_PTR* const lMap = (ULONG_PTR*)malloc(M * sizeof(ULONG_PTR));
88 |
89 | numHWCores = 0;
90 |         for (int i = 0; i * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION) < retLen;
91 | ++i) {
92 | if (sysLPInf[i].Relationship != RelationProcessorCore)
93 | continue;
94 |
95 | ULONG_PTR logicalProcessorMask = sysLPInf[i].ProcessorMask;
96 | lMap[numHWCores++] = logicalProcessorMask;
97 | numLogicalProcessors += NumSetBits(logicalProcessorMask);
98 | }
99 |
100 | physLogicalProcessorMap = (ULONG_PTR*)malloc(numHWCores * sizeof(ULONG_PTR));
101 | memcpy(physLogicalProcessorMap, lMap, numHWCores * sizeof(ULONG_PTR));
102 | free(lMap);
103 |
104 | return 0;
105 | }
106 | } // private namespace
107 |
108 | const char* BitmaskToStr(WORD bitmask)
109 | {
110 | const unsigned N = sizeof(WORD) * 8;
111 | char* const str = new char[N + 1];
112 | str[N] = 0;
113 | for (int i = 0; i < N; ++i) {
114 | str[N - i - 1] = '0' + ((bitmask)&1);
115 | bitmask >>= 1;
116 | }
117 | return str;
118 | }
119 |
120 | int GetNumHWCores()
121 | {
122 | if (!logicalProcInfoCached) {
123 | DWORD retCode = _GetSysLPMap(numHWCores);
124 | if (!retCode)
125 | logicalProcInfoCached = 1;
126 | else
127 | return -1;
128 | }
129 | return numHWCores;
130 | }
131 |
132 | int GetNumLogicalProcessors() {
133 | if (!logicalProcInfoCached) {
134 | DWORD retCode = _GetSysLPMap(numHWCores);
135 | if (!retCode)
136 | logicalProcInfoCached = 1;
137 | else
138 | return -1;
139 | }
140 | return numLogicalProcessors;
141 | }
142 |
143 | int GetProcessorMask(unsigned n, ULONG_PTR& mask)
144 | {
145 | if (!logicalProcInfoCached) {
146 | DWORD retCode = _GetSysLPMap(numHWCores);
147 | if (!retCode)
148 | logicalProcInfoCached = 1;
149 | else
150 | return retCode;
151 | }
152 |
153 | if (n >= numHWCores)
154 | return -1;
155 |
156 | mask = physLogicalProcessorMap[n];
157 |
158 | return 0;
159 | }
160 |
161 |     /* Returns a 32-bit mask with bits [i:j] set to 1 and the rest 0, computed at compile time. */
162 | constexpr int GenerateMask(int i, int j)
163 | {
164 | if (i > j)
165 | return (1 << (i + 1)) - (1 << j);
166 | else
167 | return (1 << (j + 1)) - (1 << i);
168 | }
169 |
170 | void GetCacheInfo(int* dCaches, int& iCache)
171 | {
172 | /*
173 | * From Intel's Processor Identification CPUID Instruction Notes:
174 | * EAX := 0x04, ECX := (0, 1, 2 .. until EAX[4:0]==0)
175 | * cpuid(memaddr, n, k) sets eax to n, ecx to k,
176 | * writes EAX, EBX, ECX, and EDX to memaddr[0:4] respectively.
177 | * Cache size in bytes = (Ways + 1) * (Partitions + 1)
178 | * * (Line size + 1) * (Sets + 1)
179 | * = (EBX[31:22]+1) * (EBX[21:12]+1)
180 | * * (EBX[11:0]+1) * (ECX+1)
181 | * For now, this function assumes we're on a modern Intel CPU
182 | * So we have L1,2,3 data caches and first level instruction cache
183 | */
184 |
185 | int cpui[4];
186 |
187 | for (int i = 0, dc = 0; i < 4; ++i) {
188 | __cpuidex(cpui, 4, i);
189 | int sz = (((cpui[1] & GenerateMask(31, 22)) >> 22) + 1) *
190 | (((cpui[1] & GenerateMask(21, 12)) >> 12) + 1) *
191 | ((cpui[1] & GenerateMask(11, 0)) + 1) * (cpui[2] + 1);
192 | int cacheType = (cpui[0] & 31);
193 | if (cacheType == 1 || cacheType == 3) {
194 | dCaches[dc++] = sz;
195 | } else if (cacheType == 2) {
196 | iCache = sz;
197 | }
198 | }
199 | }
200 |
201 | int GetCacheLineSize()
202 | {
203 | /*
204 | * From Intel's Processor Identification CPUID Instruction Notes:
205 | * Executing CPUID with EAX=1, fills EAX, EBX, ECX, EDX
206 | * EBX[15:8] : CLFLUSHSIZE, val*8 = cache line size
207 | */
208 | int cpui[4];
209 | __cpuid(cpui, 1);
210 | return (cpui[1] & GenerateMask(15, 8)) >> (8 - 3);
211 | }
212 |
213 | int GetHTTStatus() {
214 | int cpui[4];
215 | __cpuid(cpui, 1);
216 | return ((cpui[3] & (1<<28)) >> 28) ? 1 : 0;
217 | }
218 |
219 | int GetSIMDSupport() {
220 | int cpui[4];
221 | __cpuid(cpui, 1);
222 | int fma = (cpui[2] & (1 << 12)) >> 12;
223 | int avx = (cpui[2] & (1 << 28)) >> 28;
224 | return fma & avx;
225 | }
226 |
227 | }; // namespace CPUUtil
228 |
--------------------------------------------------------------------------------
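
To make the CPUID arithmetic in GetCacheInfo and GenerateMask above concrete, a
worked example with hypothetical leaf-4 register values (illustrative only, not
measured on any particular CPU):

    /* GenerateMask(11, 0) == (1 << 12) - (1 << 0) == 0xFFF
     * GenerateMask(15, 8) == (1 << 16) - (1 << 8) == 0xFF00
     *
     * Hypothetical CPUID leaf-4 values for one cache level:
     *   EBX[31:22] = 7   -> Ways       = 7 + 1  = 8
     *   EBX[21:12] = 0   -> Partitions = 0 + 1  = 1
     *   EBX[11:0]  = 63  -> Line size  = 63 + 1 = 64 bytes
     *   ECX        = 63  -> Sets       = 63 + 1 = 64
     *   size = 8 * 1 * 64 * 64 = 32768 bytes = 32 KB, a typical L1 data cache.
     */
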
/MatrixMult/CPUUtil.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #define WIN32_LEAN_AND_MEAN
3 | /* Header names were lost in this dump; reconstructed from what the code uses: */
4 | #include <Windows.h>
5 | #include <cstdint>
6 |
7 | namespace CPUUtil
8 | {
9 | /* Utility, convert given bitmask to const char* */
10 | const char* BitmaskToStr(WORD bitmask);
11 |
12 | /* Get number of physical processors on the runtime system */
13 | int GetNumHWCores();
14 |
15 | /* Get number of logical processors on the runtime system */
16 | int GetNumLogicalProcessors();
17 |
18 | /* Get the logical processor mask corresponding to the Nth hardware core */
19 | int GetProcessorMask(unsigned n, ULONG_PTR& mask);
20 |
21 | /* Fill dCaches with L1,2,3 data cache sizes,
22 | * and iCache with L1 dedicated instruction cache size. */
23 | void GetCacheInfo(int* dCaches, int& iCache);
24 |
25 | /* Query cache line size on the current system. */
26 | int GetCacheLineSize();
27 |
28 | /* Query whether or not the runtime system supports HTT */
29 | int GetHTTStatus();
30 |
31 | /* Query if the runtime system supports AVX and FMA instruction sets. */
32 | int GetSIMDSupport();
33 |
34 | }; // namespace CPUUtil
35 |
--------------------------------------------------------------------------------
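
A minimal usage sketch of the CPUUtil API declared above (a hypothetical driver,
not part of the original project):

    #include "CPUUtil.h"
    #include <cstdio>

    int main()
    {
        int dCaches[3] = {0, 0, 0};
        int iCache = 0;
        CPUUtil::GetCacheInfo(dCaches, iCache);
        std::printf("L1d/L2/L3: %d/%d/%d bytes, L1i: %d bytes\n",
                    dCaches[0], dCaches[1], dCaches[2], iCache);
        std::printf("HW cores: %d, logical processors: %d, cache line: %d bytes\n",
                    CPUUtil::GetNumHWCores(), CPUUtil::GetNumLogicalProcessors(),
                    CPUUtil::GetCacheLineSize());
        std::printf("HTT: %d, AVX+FMA: %d\n",
                    CPUUtil::GetHTTStatus(), CPUUtil::GetSIMDSupport());
        return 0;
    }
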
/MatrixMult/MatrixMul.cpp:
--------------------------------------------------------------------------------
1 | #define WIN32_LEAN_AND_MEAN
2 | /* Angle-bracketed header names were lost in this dump;
3 |  * the includes below are reconstructed from what the code uses. */
4 | #include <Windows.h>
5 | #include <cassert>
6 | #include <chrono>
7 | #include <cstdint>
8 | #include <cstdio>
9 | #include <cstdlib>
10 | #include <fstream>
11 | #include <immintrin.h>
12 | #include <iostream>
13 | #include <malloc.h>
14 | #include <mutex>
15 | #include <string>
16 | #include <thread>
18 | #include "ThreadPool.h"
19 |
20 | /* Define for AVX alignment requirements */
21 | #define AVX_ALIGN 32
22 |
23 | /* Define CPU related variables, actual values will be queried on runtime. */
24 | int CPUInfoQueried = 0;
25 | int L2Size = 256 * 1024;
26 | int L3Size = 12 * 1024 * 1024;
27 | int cacheLineSz = 64;
28 | int numHWCores = 6;
29 |
30 | /* Prefetching switches, if multiple MatMul operations are intended to run in parallel,
31 | * individual mutexes should be created for each one. */
32 | constexpr int doL3Prefetch = 0;
33 | constexpr int doL12Prefetch = 0;
34 | int prefetched[1024][1024];
35 | std::mutex prefetchMutex;
36 |
37 | /* Matrix structure */
38 | typedef struct Mat {
39 | unsigned width;
40 | unsigned height;
41 | unsigned rowSpan;
42 |     /* guarantee that mat will not be aliased (__restrict);
43 |        no need for two matrices to point at the same data */
44 | float* __restrict mat;
45 | } Mat;
46 |
47 | /*
48 | * This struct holds the information for multiple levels of block sizes.
49 | * It's used to keep function parameters short and readable
50 | * Constraints on block sizes:
51 |  *   L2BlockX % 3 == 0, L2BlockY % 4 == 0,
52 |  *   L3BlockX % 2 == 0, L3BlockY % 2 == 0,
53 | * (L3BlockX / 2) % L2BlockX == 0
54 | */
55 | typedef struct MMBlockInfo {
56 | const unsigned L3BlockX, L3BlockY;
57 | const unsigned L2BlockX, L2BlockY;
58 | const unsigned issuedBlockSzX, issuedBlockSzY;
59 | } MMBlockInfo;
60 |
61 | /* Load a previously saved matrix from disk */
62 | const Mat LoadMat(const char* const filename)
63 | {
64 | Mat mat;
65 | uint32_t matSize;
66 |
67 | std::ifstream in(filename, std::ios::binary | std::ios::in);
68 |
69 | if (!in.is_open()) {
70 | std::cout << "Err loading!\n";
71 | in.close();
72 | return {0, 0, 0, NULL};
73 | }
74 |
75 | in.read((char*)&mat, 3 * sizeof(uint32_t));
76 | in.read((char*)&matSize, sizeof(uint32_t));
77 | in.seekg(12 * sizeof(uint32_t), std::ios::cur);
78 | mat.mat = (float*)_aligned_malloc(matSize, AVX_ALIGN);
79 | in.read((char*)mat.mat, matSize);
80 |
81 | in.close();
82 |
83 | return mat;
84 | }
85 |
86 | /* Dump the given matrix to the disk. */
87 | static void DumpMat(const char* filename, const Mat& m)
88 | {
89 | uint32_t header[16];
90 | std::ofstream out(filename, std::ofstream::binary | std::ofstream::out);
91 |
92 | header[0] = m.width;
93 | header[1] = m.height;
94 | header[2] = m.rowSpan;
95 | header[3] = m.height * m.rowSpan * sizeof(float);
96 |
97 | out.write(reinterpret_cast<const char*>(header), sizeof(header));
98 | out.write(reinterpret_cast<const char*>(m.mat), header[3]);
99 |
100 | out.close();
101 | }
102 |
103 | /* Deallocate matrix data */
104 | void FreeMat(Mat& mat)
105 | {
106 | if (!mat.mat)
107 | return;
108 | _aligned_free(mat.mat);
109 | mat.mat = NULL;
110 | }
111 | void FreeMat(const Mat& mat)
112 | {
113 | if (!mat.mat)
114 | return;
115 | _aligned_free(mat.mat);
116 | }
117 |
118 | /* Round a given number up to the nearest multiple of pwr2,
119 |  * where pwr2 is a power of 2 (e.g. RoundUpPwr2(10, 8) == 16) */
120 | static unsigned RoundUpPwr2(unsigned val, unsigned pwr2)
121 | {
122 | return (val + (pwr2 - 1)) & (~(pwr2 - 1));
123 | }
124 |
125 | /* Compute the transpose of a given matrix.
126 |  * A single-threaded implementation without block tiling. */
127 | __declspec(noalias) const Mat TransposeMat(const Mat& mat)
128 | {
129 | const unsigned tRowSpan = RoundUpPwr2(mat.height, 64 / sizeof(float));
130 | float* __restrict const tData =
131 | (float*)_aligned_malloc(mat.width * tRowSpan * sizeof(float), AVX_ALIGN);
132 |
133 | Mat T{mat.height, mat.width, tRowSpan, tData};
134 |
135 | // the loops are truly interchangeable as we encounter a cache miss either way
136 | for (int rowT = 0; rowT < T.height; ++rowT) {
137 | for (int colT = 0; colT < T.width; ++colT) {
138 | tData[rowT * tRowSpan + colT] = mat.mat[colT * mat.rowSpan + rowT];
139 | }
140 | }
141 |
142 | return T;
143 | }
144 |
145 | /* Print the given matrix to given std::ostream */
146 | static void PrintMat(const Mat& mat, std::ostream& stream)
147 | {
148 | stream << "w, h, rS: " << mat.width << " " << mat.height << " " << mat.rowSpan
149 | << "\n";
150 | for (int i = 0; i < mat.height; i++) {
151 | for (int j = 0; j < mat.width; ++j) {
152 | stream << mat.mat[i * mat.rowSpan + j] << " ";
153 | }
154 | stream << "\n";
155 | }
156 | }
157 |
158 | /**************** Naive, initial implementations ****************/
159 |
160 | /* Naive MatMul */
161 | const Mat ST_NaiveMatMul(const Mat& matA, const Mat& matB)
162 | {
163 | /* First: a naive solution, but with some tricks to make the compiler (MSVC) behave.
164 |  * Note that, in this case, manually unrolling the loop helps
165 |  * as the compiler can't auto-vectorize non-contiguous memory access */
166 | float* __restrict const matData =
167 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);
168 |
169 | Mat matC{matB.width, matA.height, matB.rowSpan, matData};
170 |
171 | for (int rowC = 0; rowC < matA.height; ++rowC) {
172 | for (int colC = 0; colC < matB.width; ++colC) {
173 | /* an independent, local accumulator. */
174 | float accumulate = 0;
175 | int pos = 0;
176 | /* manual unrolling IS helpful in this case */
177 | for (; pos < matA.width - 4; pos += 4) {
178 | accumulate += matA.mat[rowC * matA.rowSpan + pos] *
179 | matB.mat[pos * matB.rowSpan + colC] +
180 | matA.mat[rowC * matA.rowSpan + pos + 1] *
181 | matB.mat[(pos + 1) * matB.rowSpan + colC] +
182 | matA.mat[rowC * matA.rowSpan + pos + 2] *
183 | matB.mat[(pos + 2) * matB.rowSpan + colC] +
184 | matA.mat[rowC * matA.rowSpan + pos + 3] *
185 | matB.mat[(pos + 3) * matB.rowSpan + colC];
186 | }
187 | for (; pos < matA.width; ++pos) {
188 | accumulate += matA.mat[rowC * matA.rowSpan + pos] *
189 | matB.mat[pos * matB.rowSpan + colC];
190 | }
191 | matData[rowC * matB.rowSpan + colC] = accumulate;
192 | }
193 | }
194 |
195 | return matC;
196 | }
197 |
198 | /* MatMul with transposed B for improved cache behavior. */
199 | const Mat ST_TransposedBMatMul(const Mat& matA, const Mat& matB)
200 | {
201 | /*
202 |  * Now, transposing B and then traversing it in row order seemed promising!
203 |  * Also, note that, if we manually unrolled the loop here,
204 |  * the compiler wouldn't vectorize it,
205 |  * so we keep it simple and let MSVC auto-vectorize this.
206 | */
207 | float* __restrict const matData =
208 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);
209 |
210 | Mat matC{matB.width, matA.height, matB.rowSpan, matData};
211 |
212 | const Mat matBT = TransposeMat(matB);
213 | for (int rowC = 0; rowC < matA.height; ++rowC) {
214 | for (int colC = 0; colC < matB.width; ++colC) {
215 | float accumulate = 0;
216 | for (int pos = 0; pos < matA.width; ++pos) {
217 | accumulate += matA.mat[rowC * matA.rowSpan + pos] *
218 | matBT.mat[colC * matBT.rowSpan + pos];
219 | }
220 | matData[rowC * matB.rowSpan + colC] = accumulate;
221 | }
222 | }
223 |
224 | _aligned_free(matBT.mat);
225 |
226 | return matC;
227 | }
228 |
229 | /*
230 | * MatMul with a different traversal order.
231 |  * Instead of linearly running through whole rows of the output matrix C,
232 | * calculate blocks of a certain size at a time.
233 | */
234 | const Mat ST_BlockMult(const Mat& matA, const Mat& matB)
235 | {
236 | /* Now, once we fetch column col from B, we use these cached values
237 |  * to populate C(row, col:col+8). Any more than that,
238 |  * and we lose the old cached values. But notice that
239 |  * C(row+1, col:col+8) uses the exact same columns.
240 |  * So instead of traversing in row order, we could do blocks!
241 |  * Notice that I'm using transposed B;
242 |  * that's because MSVC refuses to vectorize the loop with
243 |  * non-contiguous memory access.
244 |  * So even though the floats themselves will be in the cache,
245 |  * we won't have SIMD, which kills the performance.
246 |  *
247 |  * Also, I had to assign offsets to temporary constants,
248 |  * because otherwise MSVC can't auto-vectorize. */
249 | float* __restrict const matData =
250 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);
251 |
252 | Mat matC{matB.width, matA.height, matB.rowSpan, matData};
253 |
254 | const unsigned blockX = 16, blockY = 16;
255 |
256 | const Mat matBT = TransposeMat(matB);
257 |
258 | int rowC = 0;
259 | for (; rowC < matA.height - blockY; rowC += blockY) {
260 | int colC = 0;
261 | for (; colC < matB.width - blockX; colC += blockX) {
262 | for (int blockRow = 0; blockRow < blockY; ++blockRow) {
263 | for (int blockCol = 0; blockCol < blockX; ++blockCol) {
264 | const unsigned r = rowC + blockRow;
265 | const unsigned c = colC + blockCol;
266 | const unsigned matAoffset = r * matA.rowSpan;
267 | const unsigned matBoffset = c * matBT.rowSpan;
268 |
269 | float accumulate = 0;
270 | for (int pos = 0; pos < matA.width; ++pos) {
271 | accumulate +=
272 | matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos];
273 | }
274 | matData[r * matB.rowSpan + c] = accumulate;
275 | }
276 | }
277 | }
278 | for (int blockRow = 0; blockRow < blockY; ++blockRow) {
279 | for (int c = colC; c < matB.width; ++c) {
280 | const unsigned r = rowC + blockRow;
281 | const unsigned matAoffset = r * matA.rowSpan;
282 | const unsigned matBoffset = c * matBT.rowSpan;
283 | float accumulate = 0;
284 | for (int pos = 0; pos < matA.width; ++pos) {
285 | accumulate +=
286 | matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos];
287 | }
288 | matData[r * matB.rowSpan + c] = accumulate;
289 | }
290 | }
291 | }
292 | for (; rowC < matA.height; ++rowC) {
293 | for (int colC = 0; colC < matB.width; ++colC) {
294 | const unsigned matAoffset = rowC * matA.rowSpan;
295 | const unsigned matBoffset = colC * matBT.rowSpan;
296 | float accumulate = 0;
297 | for (int pos = 0; pos < matA.width; ++pos) {
298 | accumulate += matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos];
299 | }
300 | matData[rowC * matB.rowSpan + colC] = accumulate;
301 | }
302 | }
303 |
304 | _aligned_free(matBT.mat);
305 |
306 | return matC;
307 | }
308 |
309 | /************** ~~Naive, initial implementations~~ **************/
310 |
311 | /* Declarations of helper functions for the final implementation */
312 |
313 | __declspec(noalias) void MMHelper_MultAnyBlocks(float* __restrict const matData,
314 | const unsigned rowSpan, const Mat& matA,
315 | const Mat& matBT, const unsigned colC,
316 | const unsigned rowC, const int blockX,
317 | const int blockY,
318 | const MMBlockInfo& mmBlockInfo);
319 |
320 | __declspec(noalias) void MMHelper_MultL2Blocks(float* __restrict const matData,
321 | const unsigned rowSpan, const Mat& matA,
322 | const Mat& matBT, const unsigned col,
323 | const unsigned row,
324 | const unsigned L2BlockX,
325 | const unsigned L2BlockY);
326 |
327 | __declspec(noalias) void MMHelper_MultFullBlocks(float* __restrict const matData,
328 | const unsigned rowSpan,
329 | const Mat& matA, const Mat& matBT,
330 | const unsigned colC,
331 | const unsigned rowC,
332 | const MMBlockInfo& mmBlockInfo);
333 |
334 | /* Declarations for helper functions that handle NxM blocks */
335 |
336 | __declspec(noalias) void MMHelper_Mult4x3Blocks(float* __restrict const matData,
337 | const unsigned rowSpan, const Mat& matA,
338 | const Mat& matBT, const unsigned col,
339 | const unsigned row);
340 | __declspec(noalias) void MMHelper_Mult4x1Blocks(float* __restrict const matData,
341 | const unsigned rowSpan, const Mat& matA,
342 | const Mat& matBT, const unsigned col,
343 | const unsigned row);
344 | __declspec(noalias) void MMHelper_Mult1x3Blocks(float* __restrict const matData,
345 | const unsigned rowSpan, const Mat& matA,
346 | const Mat& matBT, const unsigned col,
347 | const unsigned row);
348 | __declspec(noalias) void MMHelper_Mult1x1Blocks(float* __restrict const matData,
349 | const unsigned rowSpan, const Mat& matA,
350 | const Mat& matBT, const unsigned col,
351 | const unsigned row);
352 |
353 | /*
354 | * Helper function for computing a block out of the output matrix C.
355 | * This function is used for the residues at the edges
356 | * after the majority of the matrix is computed as KxK sized blocks.
357 | * (t,l,b,r)->(row, col, row+blockY, col+blockX).
358 | */
359 | __declspec(noalias) void MMHelper_MultAnyBlocks(float* __restrict const matData,
360 | const unsigned rowSpan, const Mat& matA,
361 | const Mat& matBT, const unsigned colC,
362 | const unsigned rowC, const int blockX,
363 | const int blockY,
364 | const MMBlockInfo& mmBlockInfo)
365 | {
366 | /* if no work to be done, exit */
367 | if (blockX <= 0 || blockY <= 0)
368 | return;
369 |
370 | /* shorthand for some parameters */
371 | const unsigned L2BlockX = mmBlockInfo.L2BlockX, L2BlockY = mmBlockInfo.L2BlockY,
372 | L3BlockX = mmBlockInfo.L3BlockX, L3BlockY = mmBlockInfo.L3BlockY;
373 |
374 | int blockRowC = rowC;
375 | /* handle full L2Y sized rows */
376 | for (; blockRowC <= rowC + blockY - L2BlockY; blockRowC += L2BlockY) {
377 | int blockColC = colC;
378 | /* handle (L2X x L2Y) blocks */
379 | for (; blockColC <= colC + blockX - L2BlockX; blockColC += L2BlockX) {
380 | MMHelper_MultL2Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC,
381 | L2BlockX, L2BlockY);
382 | }
383 | /* handle the remaining columns, (w < L2BlockX) */
384 | for (int blockRow = blockRowC; blockRow < blockRowC + L2BlockY; blockRow += 4) {
385 | int blockCol = blockColC;
386 | if (colC + blockX - blockCol > 4) {
387 | for (; blockCol <= colC + blockX - 3; blockCol += 3) {
388 | MMHelper_Mult4x3Blocks(matData, rowSpan, matA, matBT, blockCol,
389 | blockRow);
390 | }
391 | }
392 | for (; blockCol < colC + blockX; ++blockCol) {
393 | MMHelper_Mult4x1Blocks(matData, rowSpan, matA, matBT, blockCol,
394 | blockRow);
395 | }
396 | }
397 | }
398 | /* handle rest of the rows, h < L2BlockY */
399 | for (; blockRowC <= (int)(rowC + blockY) - 4; blockRowC += 4) {
400 | int blockColC = colC;
401 | for (; blockColC <= (int)(colC + blockX) - 3; blockColC += 3) {
402 | MMHelper_Mult4x3Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC);
403 | }
404 | for (; blockColC < colC + blockX; ++blockColC) {
405 | MMHelper_Mult4x1Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC);
406 | }
407 | }
408 | /* handle the last rows one by one, h < 4 */
409 | for (; blockRowC < rowC + blockY; ++blockRowC) {
410 | int blockColC = colC;
411 | for (; blockColC <= (int)(colC + blockX) - 3; blockColC += 3) {
412 | MMHelper_Mult1x3Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC);
413 | }
414 | for (; blockColC < colC + blockX; ++blockColC) {
415 | MMHelper_Mult1x1Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC);
416 | }
417 | }
418 | }
419 |
420 | /* Calculates a 1x1 block on output matrix C. (t,l,b,r)->(row,col,row+1,col+1) */
421 | __declspec(noalias) void MMHelper_Mult1x1Blocks(float* __restrict const matData,
422 | const unsigned rowSpan, const Mat& matA,
423 | const Mat& matBT, const unsigned col,
424 | const unsigned row)
425 | {
426 | /* set up a scalar array and an accumulator for the horizontal sum (__m256 -> f32) */
427 | __declspec(align(32)) float fps[8];
428 | float accumulate;
429 |
430 | /* we will be reusing these */
431 | const unsigned matAoffset = row * matA.rowSpan;
432 | const unsigned matBToffset = col * matBT.rowSpan;
433 |
434 | /* set up accumulators */
435 | __m256 a1, a2, b1, b2;
436 | __m256 c1 = _mm256_setzero_ps();
437 | __m256 c2 = _mm256_setzero_ps();
438 |
446 | /*
447 | * process one row of A against one row of BT,
448 | * two 8-float vectors at a time:
449 | * <-----A.w---->
450 | * [---- [a1] [a2] ---- ]
451 | * [---- [b1] [b2] ---- ]
452 | */
453 |
454 | for (int pos = 0; pos < matA.width; pos += 16) {
455 | a1 = _mm256_load_ps(&matA.mat[matAoffset + pos]);
456 | a2 = _mm256_load_ps(&matA.mat[matAoffset + pos + 8]);
457 |
458 | b1 = _mm256_load_ps(&matBT.mat[matBToffset + pos]);
459 | b2 = _mm256_load_ps(&matBT.mat[matBToffset + pos + 8]);
460 |
461 | c1 = _mm256_fmadd_ps(a1, b1, c1);
462 | c2 = _mm256_fmadd_ps(a2, b2, c2);
463 | }
464 |
465 | c1 = _mm256_add_ps(c1, c2);
466 | _mm256_store_ps(&fps[0], c1);
467 |
468 | accumulate = 0;
469 | for (int i = 0; i < 8; ++i) {
470 | accumulate += fps[i];
471 | }
472 |
473 | /* store */
474 | matData[row * rowSpan + col] = accumulate;
475 | }
476 |
477 | /* Calculates a 1x3 block on the matrix C, (t,l,b,r)->(row,col,row+1,col+3) */
478 | __declspec(noalias) void MMHelper_Mult1x3Blocks(float* __restrict const matData,
479 | const unsigned rowSpan, const Mat& matA,
480 | const Mat& matBT, const unsigned col,
481 | const unsigned row)
482 | {
483 | /* set up scalar array and accumulators for doing the horizontal sum (__m256 -> f32)
484 | * and storing its value. Horizontal sum is auto-vectorized by the compiler anyways. */
485 | __declspec(align(32)) float fps[8 * 3];
486 | __declspec(align(32)) float accumulate[3];
487 |
488 | /* we will be reusing these */
489 | const unsigned matAoffset = row * matA.rowSpan;
490 | const unsigned matBToffset1 = (col + 0) * matBT.rowSpan,
491 | matBToffset2 = (col + 1) * matBT.rowSpan,
492 | matBToffset3 = (col + 2) * matBT.rowSpan;
493 |
494 | /* set up accumulators */
495 | __m256 a1, b1, b2, b3;
496 | __m256 c1 = _mm256_setzero_ps();
497 | __m256 c2 = _mm256_setzero_ps();
498 | __m256 c3 = _mm256_setzero_ps();
499 |
500 | for (int pos = 0; pos < matA.width; pos += 8) {
501 | a1 = _mm256_load_ps(&matA.mat[matAoffset + pos]);
502 |
503 | b1 = _mm256_load_ps(&matBT.mat[matBToffset1 + pos]);
504 | b2 = _mm256_load_ps(&matBT.mat[matBToffset2 + pos]);
505 | b3 = _mm256_load_ps(&matBT.mat[matBToffset3 + pos]);
506 |
507 | c1 = _mm256_fmadd_ps(a1, b1, c1);
508 | c2 = _mm256_fmadd_ps(a1, b2, c2);
509 | c3 = _mm256_fmadd_ps(a1, b3, c3);
510 | }
511 |
512 | /* horizontal sum */
513 |
514 | memset(&accumulate[0], 0, 3 * sizeof(float));
515 |
516 | _mm256_store_ps(&fps[0], c1);
517 | _mm256_store_ps(&fps[8], c2);
518 | _mm256_store_ps(&fps[16], c3);
519 |
520 | /* autovectorized */
521 | for (int i = 0; i < 3; ++i) {
522 | for (int j = 0; j < 8; ++j) {
523 | accumulate[i] += fps[i * 8 + j];
524 | }
525 | }
526 |
527 | /* stores */
528 | matData[row * rowSpan + col + 0] = accumulate[0];
529 | matData[row * rowSpan + col + 1] = accumulate[1];
530 | matData[row * rowSpan + col + 2] = accumulate[2];
531 | }
532 |
533 | /* Calculates a 4x1 block on output matrix C. (t,l,b,r)->(row,col,row+4,col+1) */
534 | __declspec(noalias) void MMHelper_Mult4x1Blocks(float* __restrict const matData,
535 | const unsigned rowSpan, const Mat& matA,
536 | const Mat& matBT, const unsigned col,
537 | const unsigned row)
538 | {
539 | /* set up scalar array and accumulators for doing the horizontal sum (__m256 -> f32)
540 | * and storing its value. Horizontal sum is auto-vectorized by the compiler anyways. */
541 | __declspec(align(32)) float fps[8 * 12];
542 | __declspec(align(32)) float accumulate[8 * 12];
543 |
544 | const unsigned matAoffset1 = (row + 0) * matA.rowSpan,
545 | matAoffset2 = (row + 1) * matA.rowSpan,
546 | matAoffset3 = (row + 2) * matA.rowSpan,
547 | matAoffset4 = (row + 3) * matA.rowSpan;
548 |
549 | const unsigned matBToffset = col * matBT.rowSpan;
550 |
551 | /* set up accumulators */
552 | __m256 a11, a12, a21, a22, a31, a32, a41, a42, b1, b2;
553 | __m256 c1 = _mm256_setzero_ps();
554 | __m256 c2 = _mm256_setzero_ps();
555 | __m256 c3 = _mm256_setzero_ps();
556 | __m256 c4 = _mm256_setzero_ps();
557 | __m256 c5 = _mm256_setzero_ps();
558 | __m256 c6 = _mm256_setzero_ps();
559 | __m256 c7 = _mm256_setzero_ps();
560 | __m256 c8 = _mm256_setzero_ps();
561 |
562 | for (int pos = 0; pos < matA.width; pos += 16) {
563 | a11 = _mm256_load_ps(&matA.mat[matAoffset1 + pos]);
564 | a12 = _mm256_load_ps(&matA.mat[matAoffset1 + pos + 8]);
565 |
566 | a21 = _mm256_load_ps(&matA.mat[matAoffset2 + pos]);
567 | a22 = _mm256_load_ps(&matA.mat[matAoffset2 + pos + 8]);
568 |
569 | a31 = _mm256_load_ps(&matA.mat[matAoffset3 + pos]);
570 | a32 = _mm256_load_ps(&matA.mat[matAoffset3 + pos + 8]);
571 |
572 | a41 = _mm256_load_ps(&matA.mat[matAoffset4 + pos]);
573 | a42 = _mm256_load_ps(&matA.mat[matAoffset4 + pos + 8]);
574 |
575 | b1 = _mm256_load_ps(&matBT.mat[matBToffset + pos]);
576 | b2 = _mm256_load_ps(&matBT.mat[matBToffset + pos + 8]);
577 |
578 | c1 = _mm256_fmadd_ps(a11, b1, c1);
579 | c2 = _mm256_fmadd_ps(a21, b1, c2);
580 | c3 = _mm256_fmadd_ps(a31, b1, c3);
581 | c4 = _mm256_fmadd_ps(a41, b1, c4);
582 |
583 | c5 = _mm256_fmadd_ps(a12, b2, c5);
584 | c6 = _mm256_fmadd_ps(a22, b2, c6);
585 | c7 = _mm256_fmadd_ps(a32, b2, c7);
586 | c8 = _mm256_fmadd_ps(a42, b2, c8);
587 | }
588 |
589 | /* horizontal sum */
590 |
591 | memset(&accumulate[0], 0, 4 * sizeof(float));
592 |
593 | c1 = _mm256_add_ps(c1, c5);
594 | c2 = _mm256_add_ps(c2, c6);
595 | c3 = _mm256_add_ps(c3, c7);
596 | c4 = _mm256_add_ps(c4, c8);
597 |
598 | _mm256_store_ps(&fps[0], c1);
599 | _mm256_store_ps(&fps[8], c2);
600 | _mm256_store_ps(&fps[16], c3);
601 | _mm256_store_ps(&fps[24], c4);
602 |
603 | /* autovectorized */
604 | for (int i = 0; i < 4; ++i) {
605 | for (int j = 0; j < 8; ++j) {
606 | accumulate[i] += fps[i * 8 + j];
607 | }
608 | }
609 |
610 | /* stores */
611 | matData[(row + 0) * rowSpan + col] = accumulate[0];
612 | matData[(row + 1) * rowSpan + col] = accumulate[1];
613 | matData[(row + 2) * rowSpan + col] = accumulate[2];
614 | matData[(row + 3) * rowSpan + col] = accumulate[3];
615 | }
616 |
617 | /* Calculates a 4x3 block on output matrix C. (t,l,b,r)->(row,col,row+4,col+3) */
618 | __declspec(noalias) void MMHelper_Mult4x3Blocks(float* __restrict const matData,
619 | const unsigned rowSpan, const Mat& matA,
620 | const Mat& matBT, const unsigned col,
621 | const unsigned row)
622 | {
623 | /* aligned placeholders and accumulators */
624 | __declspec(align(32)) float fps[8 * 12];
625 | __declspec(align(32)) float accumulate[12];
626 |
627 | const unsigned matAoffset1 = (row + 0) * matA.rowSpan,
628 | matAoffset2 = (row + 1) * matA.rowSpan,
629 | matAoffset3 = (row + 2) * matA.rowSpan,
630 | matAoffset4 = (row + 3) * matA.rowSpan,
631 | matBToffset1 = (col + 0) * matBT.rowSpan,
632 | matBToffset2 = (col + 1) * matBT.rowSpan,
633 | matBToffset3 = (col + 2) * matBT.rowSpan;
634 |
635 | /*
636 | * <-----A.w----> <-----A.w---->
637 | * [----[a1]----] [----[b1]----]
638 | * [----[a2]----] [----[b2]----]
639 | * [----[a3]----] [----[b3]----]
640 | * [----[a4]----] ^col
641 | * ^ row
642 | *
643 |  * we are now computing the dot products of 4 rows of A with 3 columns of B
644 |  * at the same time, one 8-float vector at a time.
645 | *
646 | * 3 ymm registers for b1:3,
647 | * 4*3 = 12 registers for the accumulators
648 | * 1 register for the temporary ai value loaded.
649 | * All 16 registers are used.
650 | * High arithmetic density: 7 loads -> 12 fma instructions
651 | *
652 | */
653 |
654 | /* set up SIMD variables */
655 | __m256 a, b1, b2, b3;
656 | __m256 c1 = _mm256_setzero_ps();
657 | __m256 c2 = _mm256_setzero_ps();
658 | __m256 c3 = _mm256_setzero_ps();
659 | __m256 c4 = _mm256_setzero_ps();
660 | __m256 c5 = _mm256_setzero_ps();
661 | __m256 c6 = _mm256_setzero_ps();
662 | __m256 c7 = _mm256_setzero_ps();
663 | __m256 c8 = _mm256_setzero_ps();
664 | __m256 c9 = _mm256_setzero_ps();
665 | __m256 c10 = _mm256_setzero_ps();
666 | __m256 c11 = _mm256_setzero_ps();
667 | __m256 c12 = _mm256_setzero_ps();
668 |
669 | /* if prefetch switch is set,
670 | * prefetch first sections, one cache line at a time */
671 | if constexpr (doL12Prefetch) {
672 | _mm_prefetch((const char*)&matA.mat[matAoffset1], _MM_HINT_T0);
673 | _mm_prefetch((const char*)&matA.mat[matAoffset2], _MM_HINT_T0);
674 | _mm_prefetch((const char*)&matA.mat[matAoffset3], _MM_HINT_T0);
675 | _mm_prefetch((const char*)&matA.mat[matAoffset4], _MM_HINT_T0);
676 |
677 | _mm_prefetch((const char*)&matBT.mat[matBToffset1], _MM_HINT_T0);
678 | _mm_prefetch((const char*)&matBT.mat[matBToffset2], _MM_HINT_T0);
679 | _mm_prefetch((const char*)&matBT.mat[matBToffset3], _MM_HINT_T0);
680 | }
681 |
682 | /* do the dot products */
683 | for (int pos = 0; pos < matA.width; pos += 8) {
684 | if constexpr (doL12Prefetch) {
685 | if ((pos & (unsigned)15)) {
686 | _mm_prefetch((const char*)&matA.mat[matAoffset1 + pos + 8],
687 | _MM_HINT_T0);
688 | }
689 | }
690 |
691 | b1 = _mm256_load_ps(&matBT.mat[matBToffset1 + pos]);
692 | b2 = _mm256_load_ps(&matBT.mat[matBToffset2 + pos]);
693 | b3 = _mm256_load_ps(&matBT.mat[matBToffset3 + pos]);
694 |
695 | if constexpr (doL12Prefetch) {
696 | if ((pos & (unsigned)15)) {
697 | _mm_prefetch((const char*)&matA.mat[matAoffset2 + pos + 8],
698 | _MM_HINT_T0);
699 | }
700 | }
701 |
702 | a = _mm256_load_ps(&matA.mat[matAoffset1 + pos]);
703 | c1 = _mm256_fmadd_ps(a, b1, c1);
704 | c2 = _mm256_fmadd_ps(a, b2, c2);
705 | c3 = _mm256_fmadd_ps(a, b3, c3);
706 |
707 | if constexpr (doL12Prefetch) {
708 | if ((pos & (unsigned)15)) {
709 | _mm_prefetch((const char*)&matA.mat[matAoffset3 + pos + 8],
710 | _MM_HINT_T0);
711 | }
712 | }
713 | a = _mm256_load_ps(&matA.mat[matAoffset2 + pos]);
714 | c4 = _mm256_fmadd_ps(a, b1, c4);
715 | c5 = _mm256_fmadd_ps(a, b2, c5);
716 | c6 = _mm256_fmadd_ps(a, b3, c6);
717 |
718 | if constexpr (doL12Prefetch) {
719 | if ((pos & (unsigned)15)) {
720 | _mm_prefetch((const char*)&matA.mat[matAoffset4 + pos + 8],
721 | _MM_HINT_T0);
722 | }
723 | }
724 |
725 | a = _mm256_load_ps(&matA.mat[matAoffset3 + pos]);
726 | c7 = _mm256_fmadd_ps(a, b1, c7);
727 | c8 = _mm256_fmadd_ps(a, b2, c8);
728 | c9 = _mm256_fmadd_ps(a, b3, c9);
729 |
730 | if constexpr (doL12Prefetch) {
731 | if ((pos & (unsigned)15)) {
732 | _mm_prefetch((const char*)&matBT.mat[matBToffset1 + pos + 8],
733 | _MM_HINT_T0);
734 | _mm_prefetch((const char*)&matBT.mat[matBToffset2 + pos + 8],
735 | _MM_HINT_T0);
736 | _mm_prefetch((const char*)&matBT.mat[matBToffset3 + pos + 8],
737 | _MM_HINT_T0);
738 | }
739 | }
740 |
741 | a = _mm256_load_ps(&matA.mat[matAoffset4 + pos]);
742 | c10 = _mm256_fmadd_ps(a, b1, c10);
743 | c11 = _mm256_fmadd_ps(a, b2, c11);
744 | c12 = _mm256_fmadd_ps(a, b3, c12);
745 | }
746 |
747 | /* horizontal sum */
748 | memset(&accumulate[0], 0, 12 * sizeof(float));
749 |
750 | _mm256_store_ps(&fps[0], c1);
751 | _mm256_store_ps(&fps[8], c2);
752 | _mm256_store_ps(&fps[16], c3);
753 | _mm256_store_ps(&fps[24], c4);
754 | _mm256_store_ps(&fps[32], c5);
755 | _mm256_store_ps(&fps[40], c6);
756 | _mm256_store_ps(&fps[48], c7);
757 | _mm256_store_ps(&fps[56], c8);
758 | _mm256_store_ps(&fps[64], c9);
759 | _mm256_store_ps(&fps[72], c10);
760 | _mm256_store_ps(&fps[80], c11);
761 | _mm256_store_ps(&fps[88], c12);
762 |
763 | for (int i = 0; i < 12; ++i) {
764 | for (int j = 0; j < 8; ++j) {
765 | accumulate[i] += fps[i * 8 + j];
766 | }
767 | }
768 |
769 | /* stores */
770 | matData[(row + 0) * rowSpan + col + 0] = accumulate[0];
771 | matData[(row + 0) * rowSpan + col + 1] = accumulate[1];
772 | matData[(row + 0) * rowSpan + col + 2] = accumulate[2];
773 |
774 | matData[(row + 1) * rowSpan + col + 0] = accumulate[3];
775 | matData[(row + 1) * rowSpan + col + 1] = accumulate[4];
776 | matData[(row + 1) * rowSpan + col + 2] = accumulate[5];
777 |
778 | matData[(row + 2) * rowSpan + col + 0] = accumulate[6];
779 | matData[(row + 2) * rowSpan + col + 1] = accumulate[7];
780 | matData[(row + 2) * rowSpan + col + 2] = accumulate[8];
781 |
782 | matData[(row + 3) * rowSpan + col + 0] = accumulate[9];
783 | matData[(row + 3) * rowSpan + col + 1] = accumulate[10];
784 | matData[(row + 3) * rowSpan + col + 2] = accumulate[11];
785 | }
786 |
787 | /*
788 | * Compute L2Y x L2X sized blocks from the output matrix C.
789 | * In order to keep this code nice and hot in instruction cache,
790 | * keep it restricted to full blocks of L2X x L2Y.
791 | */
792 | __declspec(noalias) void MMHelper_MultL2Blocks(float* __restrict const matData,
793 | const unsigned rowSpan, const Mat& matA,
794 | const Mat& matBT, const unsigned col,
795 | const unsigned row,
796 | const unsigned L2BlockX,
797 | const unsigned L2BlockY)
798 | {
799 | /* multiply 4x3 blocks, L2blockX == 3*k, L2blockY == 4*m */
800 | for (int blockRow = row; blockRow < row + L2BlockY; blockRow += 4) {
801 | for (int blockCol = col; blockCol < col + L2BlockX; blockCol += 3) {
802 | MMHelper_Mult4x3Blocks(matData, rowSpan, matA, matBT, blockCol, blockRow);
803 | }
804 | }
805 | }
806 |
807 | /* Compute K x K sized blocks of the output matrix C, see struct MMBlockInfo */
808 | __declspec(noalias) void MMHelper_MultFullBlocks(float* __restrict const matData,
809 | const unsigned rowSpan,
810 | const Mat& matA, const Mat& matBT,
811 | const unsigned colC,
812 | const unsigned rowC,
813 | const MMBlockInfo& mmBlockInfo)
814 | {
815 | const unsigned L2BlockX = mmBlockInfo.L2BlockX, L2BlockY = mmBlockInfo.L2BlockY,
816 | L3BlockX = mmBlockInfo.L3BlockX, L3BlockY = mmBlockInfo.L3BlockY,
817 | issuedBlockSzX = mmBlockInfo.issuedBlockSzX,
818 | issuedBlockSzY = mmBlockInfo.issuedBlockSzY;
819 |
820 | /* try to prefetch the next block's columns of BT into cache while still handling this one */
821 | {
822 | if constexpr (doL3Prefetch) {
823 | std::unique_lock lock(prefetchMutex);
824 | int alreadyPrefetchedCol =
825 | prefetched[rowC / L3BlockY][colC / issuedBlockSzX];
826 | lock.unlock();
827 | if (!alreadyPrefetchedCol) {
828 | for (int c = colC + issuedBlockSzX; c < colC + 2 * issuedBlockSzX; ++c) {
829 | for (int pos = 0; pos < matA.rowSpan;
830 | pos += cacheLineSz / sizeof(float)) {
831 | _mm_prefetch((const char*)&matBT.mat[c * matBT.rowSpan + pos],
832 | _MM_HINT_T2);
833 | }
834 | }
835 | lock.lock();
836 | prefetched[rowC / L3BlockY][colC / issuedBlockSzX]++;
837 | lock.unlock();
838 | }
839 | }
840 | }
841 |
842 | /* multiply L2YxL2X blocks */
843 | for (int blockColC = colC; blockColC < colC + issuedBlockSzX;
844 | blockColC += L2BlockX) {
845 | for (int blockRowC = rowC; blockRowC < rowC + issuedBlockSzY;
846 | blockRowC += L2BlockY) {
847 | MMHelper_MultL2Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC,
848 | L2BlockX, L2BlockY);
849 | }
850 | }
851 | }
852 |
853 | /*
854 | * This function divides the matrix multiplication into segments and
855 | * issues commands for a cache aware thread pool to handle them.
856 | * Uses the helper functions above.
857 | */
858 | __declspec(noalias) const Mat MTMatMul(const Mat& matA, const Mat& matB)
859 | {
860 | /* if CPU information is not already queried, do so */
861 | if (!CPUInfoQueried) {
862 | int dCaches[3];
863 | int iCache;
864 |
865 | CPUUtil::GetCacheInfo(&dCaches[0], iCache);
866 |
867 | L2Size = dCaches[1];
868 | L3Size = dCaches[2];
869 |
870 | cacheLineSz = CPUUtil::GetCacheLineSize();
871 |
872 | CPUInfoQueried++;
873 | }
874 |
875 | /* allocate the aligned float array for our new matrix C */
876 | float* __restrict const matData =
877 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);
878 |
879 | /* construct matrix C */
880 | Mat matC{matB.width, matA.height, matB.rowSpan, matData};
881 |
882 | /* for the sake of cache, we'll be working with transposed B */
883 | const Mat matBT = TransposeMat(matB);
884 |
885 | /* initialize the HWLocalThreadPool with 1 or 2 threads per physical core
886 | * for all physical cores. Number of threads per core depends on HTT status. */
887 | const int HTTEnabled = CPUUtil::GetHTTStatus();
888 | const int jobStride = (1 << HTTEnabled);
889 | HWLocalThreadPool tp(0, jobStride);
890 |
891 | /* decide the block sizes for the given matrix and CPU */
892 | const float invN = 1.0f / matA.rowSpan;
893 |
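/* QL2/QL3 estimate roughly how many rows of rowSpan floats fit in the L2/L3 cache */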
894 | int QL2 = invN * L2Size / sizeof(float);
895 | int QL3 = invN * L3Size / sizeof(float);
896 | int k = min(max(QL2 / 6, 1), 10);
897 | int m = min(max(QL2 / 8, 1), 10);
898 | int L2BlockX = 3 * k;
899 | int L2BlockY = 4 * m;
900 | int lcmMN = std::lcm(k, m);
901 | int L3BlockX = min(max(QL3 / 120 / lcmMN * lcmMN * 60, 12*L2BlockX), 360);
902 | int L3BlockY = L3BlockX;
903 | int issuedBlockSzX = L3BlockX / 4;
904 | int issuedBlockSzY = L3BlockY / 3;
905 |
906 | /*printf("%d %d\n%d %d %d %d %d %d\n", matC.height, matC.width, L2BlockX, L2BlockY, issuedBlockSzX, issuedBlockSzY,
907 | L3BlockX, L3BlockY);*/
908 |
909 | MMBlockInfo mmBlockInfo{L3BlockX, L3BlockY, L2BlockX,
910 | L2BlockY, issuedBlockSzX, issuedBlockSzY};
911 |
912 | /* before we begin, start prefetching the first L3 level block */
913 | /* reset the prefetched flags */
914 | memset(&prefetched[0][0], 0, 1024 * 1024 * sizeof(int));
915 | /* prefetch rows of A and columns of B, one cache line at a time */
916 | for (int r = 0; r < L3BlockY; ++r) {
917 | for (int pos = 0; pos < matA.rowSpan; pos += cacheLineSz / sizeof(float)) {
918 | _mm_prefetch((const char*)&matA.mat[r * matA.rowSpan + pos], _MM_HINT_T2);
919 | }
920 | }
921 | for (int c = 0; c < L3BlockX; ++c) {
922 | for (int pos = 0; pos < matA.rowSpan; pos += cacheLineSz / sizeof(float)) {
923 | _mm_prefetch((const char*)&matBT.mat[c * matBT.rowSpan + pos], _MM_HINT_T2);
924 | }
925 | }
926 | /* prefetch is called for the first block, mark it. */
927 | prefetched[0][0]++;
928 |
929 | /* start issuing jobs for the thread pool */
930 |
931 | /*
932 | * We incorporate multiple levels of tiling into our traversal.
933 | *
934 | * If we issue commands linearly, we'll have poor L3 cache utilization.
935 | * [ [C0T0 | C0T1] [C1T0 | C1T1] ... [C5T0 | C5T1] ] covering a rows, b columns,
936 | * (a+b)N floats of data is needed to compute a*b sized block.
937 | * So, instead, we issue commands in the blocked manner, like:
938 | * [ [C0T0 | C0T1] [C1T0 | C1T1]
939 |  *   [C2T0 | C2T1] [C3T0 | C3T1] ]
940 | *
941 | * Traverse L3 sized blocks,
942 | * inside each, issue issuedBlockSz sized blocks.
943 | */
944 |
945 | int rowC = 0;
946 | /* handle L3Y sized rows;
947 |  * cast unsigned dimensions to signed to avoid unsigned wraparound */
948 | for (; rowC <= (int)matA.height - L3BlockY; rowC += L3BlockY) {
949 | int colC = 0;
950 | /* handle L3Y x L3X sized blocks */
951 | for (; colC <= (int)matB.width - L3BlockX; colC += L3BlockX) {
952 | /* Issue issuedBlockSzY x issuedBlockSzX sized blocks */
953 | for (int blockRowC = rowC; blockRowC < rowC + L3BlockY;
954 | blockRowC += issuedBlockSzY) {
955 | for (int blockColC = colC; blockColC < colC + L3BlockX;
956 | blockColC += jobStride * issuedBlockSzX) {
957 | tp.Add({
958 | HWLocalThreadPool::WrapFunc(MMHelper_MultFullBlocks, matData,
959 | matB.rowSpan, matA, matBT, blockColC,
960 | blockRowC, mmBlockInfo),
961 | HWLocalThreadPool::WrapFunc(MMHelper_MultFullBlocks, matData,
962 | matB.rowSpan, matA, matBT,
963 | blockColC + issuedBlockSzX,
964 | blockRowC, mmBlockInfo)
965 | });
966 | }
967 | }
968 | }
969 | /* handle the block w < L3X, h = L3Y at the end of the row */
970 | if (matB.width > colC) {
971 | const unsigned remSubX = (matB.width - colC) >> HTTEnabled;
972 | tp.Add({
973 | HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
974 | matB.rowSpan, matA, matBT, colC, rowC,
975 | remSubX, L3BlockY, mmBlockInfo),
976 | HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
977 | matB.rowSpan, matA, matBT,
978 | colC + remSubX, rowC,
979 | matB.width - colC - remSubX, L3BlockY,
980 | mmBlockInfo)
981 | });
982 | }
983 | }
984 | /* handle last row, h < L3Y */
985 | int colC = 0;
986 | /* first handle blocks of w = L3X, h < L3Y */
987 | for (; colC <= (int)matB.width - L3BlockX; colC += jobStride * issuedBlockSzX) {
988 | tp.Add({
989 | HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
990 | matB.rowSpan, matA, matBT, colC,
991 | rowC, issuedBlockSzX, matA.height - rowC,
992 | mmBlockInfo),
993 | HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
994 | matB.rowSpan, matA, matBT,
995 | colC + issuedBlockSzX, rowC, issuedBlockSzX,
996 | matA.height - rowC, mmBlockInfo)});
997 | }
998 | /* now handle the rightmost block of w < L3X, h < L3Y */
999 | tp.Add({HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData, matB.rowSpan,
1000 | matA, matBT, colC, rowC, matB.width - colC,
1001 | matA.height - rowC, mmBlockInfo),
1002 | []() {}});
1003 |
1004 | /* -- commands issued -- */
1005 |
1006 | /* wait for the thread pool to finish */
1007 | tp.Close();
1008 | /* free the temporary bT matrix */
1009 | _aligned_free(matBT.mat);
1010 |
1011 | return matC;
1012 | }
1013 |
1014 | /* MatMul function, a simple branch that calls the proper implementation
1015 | * based on the complexity of the input matrix. */
1016 | const Mat MatMul(const Mat& matA, const Mat& matB)
1017 | {
1018 | /*
1019 | * If complexity is low enough,
1020 | * use the single threaded, transposed B method.
1021 | * A(N, M) B(M, K) => # of ops ~= 2*N*K*M
1022 | */
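/* 350^3 is ~4.3e7 multiply-adds; below that, transposing B and
 * spinning up the thread pool likely costs more than MT saves. */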
1023 | if (matA.height * matA.width * matB.width < 350 * 350 * 350) {
1024 | return ST_TransposedBMatMul(matA, matB);
1025 | }
1026 | return MTMatMul(matA, matB);
1027 | }
1028 |
1029 | int __cdecl main(int argc, char* argv[])
1030 | {
1031 | if (argc < 4) {
1032 | std::cout << "No args\n";
1033 | return 0;
1034 | }
1035 |
1036 | /* make sure the runtime system supports AVX and FMA ISAs */
1037 | assert(CPUUtil::GetSIMDSupport());
1038 |
1039 | const char* inputMtxAFile = argv[1];
1040 | const char* inputMtxBFile = argv[2];
1041 | const char* outMtxABFile = argv[3];
1042 |
1043 | //const char* inputMtxAFile = "matrixAx.bin";
1044 | //const char* inputMtxBFile = "matrixBx.bin";
1045 | //const char* outMtxABFile = "matrixAB-out.bin";
1046 |
1047 | const Mat inputMtxA = LoadMat(inputMtxAFile);
1048 | const Mat inputMtxB = LoadMat(inputMtxBFile);
1049 |
1050 | /*printf("%d %d %d %d\n", inputMtxA.height, inputMtxA.width, inputMtxB.height,
1051 | inputMtxB.width);*/
1052 |
1053 | auto start = std::chrono::high_resolution_clock::now();
1054 | const Mat outMtxAB = MatMul(inputMtxA, inputMtxB);
1055 | auto end = std::chrono::high_resolution_clock::now();
1056 |
1057 | std::cout
1058 | << "Matrix Multiplication: "
1059 | << std::chrono::duration_cast<std::chrono::microseconds>(end - start).count()
1060 | << " microseconds.\n";
1061 |
1062 | DumpMat(outMtxABFile, outMtxAB);
1063 |
1064 | FreeMat(inputMtxA);
1065 | FreeMat(inputMtxB);
1066 | FreeMat(outMtxAB);
1067 |
1068 | return 0;
1069 | }
1070 |
--------------------------------------------------------------------------------
/MatrixMult/MatrixMult.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 | Debug
14 | x64
15 |
16 |
17 | Release
18 | x64
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 | 15.0
31 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}
32 | MatrixMult
33 | 10.0.16299.0
34 |
35 |
36 |
37 | Application
38 | true
39 | v141
40 | MultiByte
41 |
42 |
43 | Application
44 | false
45 | v141
46 | true
47 | MultiByte
48 |
49 |
50 | Application
51 | true
52 | v141
53 | MultiByte
54 |
55 |
56 | Application
57 | false
58 | v141
59 | true
60 | MultiByte
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 | Level3
84 | MaxSpeed
85 | true
86 | true
87 | true
88 | Speed
89 | Default
90 |
91 |
92 | Console
93 |
94 |
95 |
96 |
97 | Level3
98 | Disabled
99 | true
100 | true
101 | true
102 | Default
103 | false
104 | AdvancedVectorExtensions2
105 | Speed
106 | false
107 | false
108 | true
109 | Fast
110 | /Qvec-report:2 %(AdditionalOptions)
111 | false
112 | stdcpp17
113 | SyncCThrow
114 |
115 |
116 | Console
117 |
118 |
119 |
120 |
121 | Level3
122 | MaxSpeed
123 | true
124 | true
125 | true
126 | true
127 |
128 |
129 | true
130 | true
131 | true
132 |
133 |
134 |
135 |
136 | Level3
137 | true
138 | true
139 | true
140 | true
141 | Speed
142 | AdvancedVectorExtensions2
143 | Fast
144 | true
145 | true
146 | /Qvec-report:2 /Qpar-report:2 %(AdditionalOptions)
147 | false
148 | false
149 | false
150 | stdcpp17
151 | true
152 | false
153 | true
154 | true
155 | No
156 | false
157 | false
158 |
159 | COFFEELAKE
160 | COFFEELAKE
161 | Coffeelake
162 |
163 |
164 | true
165 | true
166 | Console
167 | true
168 |
169 |
170 |
171 |
172 |
173 |
--------------------------------------------------------------------------------
/MatrixMult/MatrixMult.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF}
6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx
7 |
8 |
9 | {93995380-89BD-4b04-88EB-625FBE52EBFB}
10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd
11 |
12 |
13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01}
14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms
15 |
16 |
17 |
18 |
19 | Source Files
20 |
21 |
22 | Source Files
23 |
24 |
25 |
26 |
27 | Header Files
28 |
29 |
30 | Header Files
31 |
32 |
33 |
--------------------------------------------------------------------------------
/MatrixMult/ThreadPool.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include <Windows.h>
3 | #include <cassert>
4 | #include <condition_variable>
5 | #include <functional>
6 | #include <future>
7 | #include <memory>
8 | #include <mutex>
9 | #include <queue>
10 | #include <thread>
11 | #include <utility>
12 | #include <vector>
13 | #include "CPUUtil.h"
14 |
15 | /*
16 | * Thread pool that respects cache locality on HyperThreaded CPUs (WIN32 API dependent)
17 | *
18 | * Each job is described as an array of N functions. (ideal N=2 for HT)
19 | * For each job, N threads are created and assigned respective functions.
20 | * For a given job, all threads are guaranteed to be on the same physical core.
21 | * No two threads from different jobs are allowed on the same physical core.
22 | *
23 | *
24 | * Why?
25 | * When doing multithreading on cache sensitive tasks,
26 | * we want to keep threads that operate on same or contiguous memory region
27 |  * on the same physical core s.t. they share the same L2 cache.
28 | *
29 |  * Reference: This code is influenced by a writeup that explains thread pools at
30 | * https://github.com/mtrebi/thread-pool/blob/master/README.md
31 | *
32 | * Structure:
33 | * CPUUtil:
34 | * Uses Windows API to detect the number of physical cores, cache sizes
35 | * and mapping between physical and logical processors.
36 | *
37 | * HWLocalThreadPool:
38 | * Submission:
39 |  * an initializer list or vector of (void function (void)) of length N,
40 |  * where N is the number of threads that will spawn on the same core
41 |  * and the length of the std::function array.
42 |  * The ith thread handles the respective ith function.
43 | *
44 | * Core Handlers:
45 | * We create NumHWCores many CoreHandler objects.
46 | * These objects are responsible for managing their cores.
47 |  * They check the main pool for jobs; when a job is found:
48 |  * if N==1, they call the only function in the job description.
49 |  * if N>1, they assign N-1 threads on the same physical core to the
50 |  * respective functions in the array. The CoreHandler is
51 |  * assigned to the first function.
52 |  * Once the CoreHandler finishes its own task, it waits for the other threads.
53 |  * Then it's available for new jobs, waiting to be notified by the pool manager.
54 | *
55 | * Thread Handlers:
56 | * Responsible for handling tasks handed away by the CoreHandler.
57 |  * When they finish execution, they signal the CoreHandler.
58 |  * Then they wait for a new task to run, until they are terminated.
59 | *
60 | * Notes:
61 | *
62 | * DON'T KEEP THESE TASKS TOO SMALL.
63 |  * We don't want our CoreHandler to check its children's states constantly,
64 |  * so when a thread finishes a task, we signal the CoreHandler.
65 |  * This might become an overhead if the task itself is trivial.
66 |  * In that case you probably shouldn't be using this structure anyway,
67 |  * but if you want to, you can change it so that
68 |  * the CoreHandler periodically checks the m_childThreadOnline array and sleeps in between.
69 | *
70 | */
71 |
72 | class HWLocalThreadPool {
73 | public:
74 | HWLocalThreadPool(int _numOfCoresToUse, int _numThreadsPerCore) : m_terminate(false)
75 | {
76 | m_numHWCores = CPUUtil::GetNumHWCores();
77 |
78 | if (_numOfCoresToUse <= 0) {
79 | m_numCoreHandlers = m_numHWCores;
80 | } else {
81 | m_numCoreHandlers = _numOfCoresToUse;
82 | }
83 |
84 | if (_numThreadsPerCore <= 0) {
85 | m_numThreadsPerCore =
86 | CPUUtil::GetNumLogicalProcessors() / m_numCoreHandlers;
87 | } else {
88 | m_numThreadsPerCore = _numThreadsPerCore;
89 | }
90 |
91 | /* malloc m_coreHandlers s.t. no default initialization takes place;
92 | we construct every object with placement new */
93 | m_coreHandlers = (CoreHandler*)malloc(m_numCoreHandlers * sizeof(CoreHandler));
94 | m_coreHandlerThreads = new std::thread[m_numCoreHandlers];
95 |
96 | for (int i = 0; i < m_numCoreHandlers; ++i) {
97 | ULONG_PTR processAffinityMask;
98 | int maskQueryRetCode = CPUUtil::GetProcessorMask(i, processAffinityMask);
99 | if (maskQueryRetCode) {
100 | assert(0 && "Can't query processor relations.");
101 | return;
102 | }
103 | CoreHandler* coreHandler =
104 | new (&m_coreHandlers[i]) CoreHandler(this, i, processAffinityMask);
105 | m_coreHandlerThreads[i] = std::thread(std::ref(m_coreHandlers[i]));
106 | }
107 | }
108 |
109 | ~HWLocalThreadPool()
110 | {
111 | if (!m_terminate)
112 | Close();
113 | }
114 |
115 | void Add(std::vector<std::function<void()>> const& F)
116 | {
117 | m_queue.Push(F);
118 | m_queueToCoreNotifier.notify_one();
119 | }
120 |
121 | /* if finishQueue is set, cores will terminate after handling every job in the queue;
122 | if not, they will finish the current job they have and terminate. */
123 | void Close(const bool finishQueue = true)
124 | {
125 | {
126 | std::unique_lock lock(m_queueMutex);
127 | m_terminate = 1;
128 | m_waitToFinish = finishQueue;
129 | m_queueToCoreNotifier.notify_all();
130 | }
131 |
132 | for (int i = 0; i < m_numCoreHandlers; ++i) {
133 | if (m_coreHandlerThreads[i].joinable())
134 | m_coreHandlerThreads[i].join();
135 | }
136 |
137 | /* free doesn't call the destructor, so */
138 | for (int i = 0; i < m_numCoreHandlers; ++i) {
139 | m_coreHandlers[i].~CoreHandler();
140 | }
141 | free(m_coreHandlers);
142 | delete[] m_coreHandlerThreads;
143 | }
144 |
145 | const unsigned NumCores()
146 | {
147 | return m_numHWCores;
148 | }
149 |
150 | const unsigned NumThreadsPerCore()
151 | {
152 | return m_numThreadsPerCore;
153 | }
154 |
155 | template <typename F, typename... Args>
156 | static std::function<void()> WrapFunc(F&& f, Args&&... args)
157 | {
158 | std::function<void()> func =
159 | std::bind(std::forward<F>(f), std::forward<Args>(args)...);
160 | auto task_ptr =
161 | std::make_shared<std::packaged_task<void()>>(func);
162 |
163 | std::function<void()> wrapper_func = [task_ptr]() { (*task_ptr)(); };
164 |
165 | return wrapper_func;
166 | }
167 |
168 | protected:
169 | template <class T> class Queue {
170 | public:
171 | Queue()
172 | {
173 | }
174 | ~Queue()
175 | {
176 | }
177 |
178 | void Push(T const& element)
179 | {
180 | std::unique_lock lock(m_mutex);
181 | m_queue.push(std::move(element));
182 | }
183 |
184 | bool Pop(T& function)
185 | {
186 | std::unique_lock lock(m_mutex);
187 | if (!m_queue.empty()) {
188 | function = std::move(m_queue.front());
189 | m_queue.pop();
190 | return true;
191 | }
192 | return false;
193 | }
194 |
195 | int Size()
196 | {
197 | std::unique_lock lock(m_mutex);
198 | return m_queue.size();
199 | }
200 |
201 | private:
202 | std::queue<T> m_queue;
203 | std::mutex m_mutex;
204 | };
205 |
206 | class CoreHandler {
207 | public:
208 | CoreHandler(HWLocalThreadPool* const _parent, const unsigned _id,
209 | const ULONG_PTR& _processorMask)
210 | : m_parent(_parent), m_id(_id), m_processorAffinityMask(_processorMask),
211 | m_terminate(false), m_numChildThreads(_parent->m_numThreadsPerCore - 1)
212 | {
213 | if (m_numChildThreads > 0) {
214 | m_childThreads = new std::thread[m_numChildThreads];
215 | m_childThreadOnline = new bool[m_numChildThreads];
216 | std::unique_lock lock(m_threadMutex);
217 | for (int i = 0; i < m_numChildThreads; ++i) {
218 | m_childThreadOnline[i] = 0;
219 | m_childThreads[i] =
220 | std::thread(ThreadHandler(this, i, m_processorAffinityMask));
221 | }
222 | }
223 | }
224 |
225 | void WaitForChildThreads()
226 | {
227 | if (!m_childThreads || m_numChildThreads < 1)
228 | return;
229 |
230 | std::unique_lock lock(m_threadMutex);
231 | bool anyOnline = 1;
232 | while (anyOnline) {
233 | anyOnline = 0;
234 | for (int i = 0; i < m_numChildThreads; ++i) {
235 | anyOnline |= m_childThreadOnline[i];
236 | }
237 | if (anyOnline) {
238 | m_threadToCoreNotifier.wait(lock);
239 | }
240 | }
241 | }
242 |
243 | void CloseChildThreads()
244 | {
245 | if (m_terminate || m_numChildThreads < 1)
246 | return;
247 |
248 | {
249 | std::unique_lock lock(m_threadMutex);
250 | m_terminate = 1;
251 | m_coreToThreadNotifier.notify_all();
252 | }
253 |
254 | /* Core closing threads */
255 | for (int i = 0; i < m_numChildThreads; ++i) {
256 | if (m_childThreads[i].joinable()) {
257 | m_childThreads[i].join();
258 | }
259 | }
260 |
261 | delete[] m_childThreads;
262 | delete[] m_childThreadOnline;
263 | }
264 |
265 | void operator()()
266 | {
267 | SetThreadAffinityMask(GetCurrentThread(), m_processorAffinityMask);
268 | bool dequeued;
269 | while (1) {
270 | {
271 | std::unique_lock lock(m_parent->m_queueMutex);
272 | if (m_parent->m_terminate &&
273 | !(m_parent->m_waitToFinish && m_parent->m_queue.Size() > 0)) {
274 | break;
275 | }
276 | if (m_parent->m_queue.Size() == 0) {
277 | m_parent->m_queueToCoreNotifier.wait(lock);
278 | }
279 | dequeued = m_parent->m_queue.Pop(m_job);
280 | }
281 | if (dequeued) {
282 | m_ownJob = std::move(m_job[0]);
283 | if (m_numChildThreads < 1) {
284 | m_ownJob();
285 | } else {
286 | {
287 | std::unique_lock lock(m_threadMutex);
288 | for (int i = 0; i < m_numChildThreads; ++i) {
289 | m_childThreadOnline[i] = 1;
290 | }
291 | m_coreToThreadNotifier.notify_all();
292 | }
293 |
294 | m_ownJob();
295 |
296 | WaitForChildThreads();
297 | }
298 | }
299 | }
300 | CloseChildThreads();
301 | }
302 |
303 | class ThreadHandler {
304 | public:
305 | ThreadHandler(CoreHandler* _parent, const unsigned _id,
306 | const ULONG_PTR& _processorAffinityMask)
307 | : m_parent(_parent), m_processorAffinityMask(_processorAffinityMask),
308 | m_id(_id), m_jobSlot(_id + 1)
309 | {
310 | }
311 |
312 | void operator()()
313 | {
314 | SetThreadAffinityMask(GetCurrentThread(), m_processorAffinityMask);
315 | while (1) {
316 | {
317 | std::unique_lock lock(m_parent->m_threadMutex);
318 | if (m_parent->m_terminate)
319 | break;
320 | if (!m_parent->m_childThreadOnline[m_id]) {
321 | m_parent->m_coreToThreadNotifier.wait(lock);
322 | }
323 | }
324 | bool online = 0;
325 | {
326 | std::unique_lock lock(m_parent->m_threadMutex);
327 | online = m_parent->m_childThreadOnline[m_id];
328 | }
329 | if (online) {
330 | func = std::move(m_parent->m_job[m_jobSlot]);
331 | func();
332 | std::unique_lock lock(m_parent->m_threadMutex);
333 | m_parent->m_childThreadOnline[m_id] = 0;
334 | m_parent->m_threadToCoreNotifier.notify_one();
335 | }
336 | }
337 | }
338 |
339 | const unsigned m_id;
340 | const unsigned m_jobSlot;
341 | CoreHandler* m_parent;
342 | ULONG_PTR m_processorAffinityMask;
343 | std::function<void()> func;
344 | };
345 |
346 | const unsigned m_id;
347 | HWLocalThreadPool* const m_parent;
348 | const ULONG_PTR m_processorAffinityMask;
349 | const unsigned m_numChildThreads;
350 |
351 | std::thread* m_childThreads;
352 | bool* m_childThreadOnline;
353 | bool m_terminate;
354 |
355 | std::vector<std::function<void()>> m_job;
356 | std::function<void()> m_ownJob;
357 |
358 | std::mutex m_threadMutex;
359 | std::condition_variable m_coreToThreadNotifier;
360 | std::condition_variable m_threadToCoreNotifier;
361 | };
362 |
363 | private:
364 | unsigned m_numHWCores, m_numCoreHandlers, m_numThreadsPerCore;
365 | CoreHandler* m_coreHandlers;
366 | std::thread* m_coreHandlerThreads;
367 |
368 | Queue<std::vector<std::function<void()>>> m_queue;
369 |
370 | bool m_terminate, m_waitToFinish;
371 |
372 | std::mutex m_queueMutex;
373 | std::condition_variable m_queueToCoreNotifier;
374 | };
375 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multithreaded, Lightning fast Matrix-Matrix Multiplication
2 |
3 | [See CHANGELOG](#changelog)
4 |
5 | [What's next?](#whats-next)
6 |
7 | In this project, I’ve implemented multiple methods for multiplying
8 | matrices, and relevant utilities. My prime focuses were:
9 |
10 | - Cache locality, memory access patterns.
11 |
12 | - SIMD, hand optimized AVX/FMA intrinsics.
13 |
14 | - Software prefetching to maximize pipeline utilization.
15 |
16 | - Cache friendly multithreading.
17 |
18 | I didn’t implement Strassen’s algorithm; this code runs in O(N^3).
19 |
20 | # How to run
21 |
22 | **Requirements:**
23 | * Windows platform
24 | * 64-bit Intel CPU with AVX / FMA support
25 |
26 | Currently, if you're looking to use this code, just copy and include CPUUtil.\* and ThreadPool.h, then copy the contents of MatrixMul.cpp (except main()) into a namespace; the code should then be ready to compile as a header-only library. I'll tidy the code up into a proper library soon.
27 |
28 | Note that this program relies on Intel-specific CPUID responses and intrinsics, and on the Win32 API for logical-to-physical processor mapping and setting thread affinity.
29 |
30 | Running the example code:
31 | Build the solution (see build options), then navigate to *x64\\Release\\* and run the command below, or call “run.bat”. If
32 | you don’t have the “tee” command, just delete the last part or install
33 | GnuWin32 CoreUtils.
34 |
35 | ``` bash
36 | for /l %x in (1, 1, 100) do echo %x && (MatrixGenerator.exe && printf "Generated valid output. Testing...\n" && MatrixMult.exe matrixA.bin matrixB.bin matrixAB-out.bin && printf \n\n ) | tee -a out.txt
37 | ```
38 |
39 | # Benchmarks
40 |
41 | On my machine (6-core i7-8700K), I’ve compared my implementation against:
42 |
43 | * Eigen library (with all the compiler optimizations turned on)
44 | * I've tested both Eigen's own implementation and Eigen compiled with the MKL+TBB backend; runtime analysis shows that the benchmark indeed uses the MKL kernel for matrix multiplication and that Eigen doesn't introduce any overhead.
45 | * Multithreaded python-numpy, which uses a C/C++ backend and the Intel MKL BLAS
46 | library. The code can be found under the Benchmarks folder; however, the graph below doesn't include it, as it was consistently slower than Eigen (MKL+TBB).
47 |
48 | ## Comparison
49 |
50 | The current implementation runs as fast as, or slightly faster than, Eigen (MKL+TBB) for all test cases (tested up to N=15K)! Intel Advisor and VTune clearly show that the MKL kernel *avx2_dgemm_kernel_0* is used and that no abnormal overheads are present.
51 |
52 | 
53 |
54 | ## Multithreading utilities ([ThreadPool.h](https://github.com/talhasaruhan/cpp-matmul/blob/master/MatrixMult/ThreadPool.h))
55 |
56 | ``` c++
57 | Namespace CPUUtil,
58 | HWLocalThreadPool(NumOfCoresToUse, NumThreadsPerCore)
59 | ```
60 |
61 | The CPUUtil namespace has utility functions for querying the runtime system: logical-to-physical processor mapping, cache sizes, cache line size, hyperthreading status, AVX/FMA instruction set support, and a few more.
62 |
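A minimal usage sketch for these queries (a hypothetical example, not a file in the repo):

``` c++
#include "CPUUtil.h"
#include <cstdio>

int main()
{
    int dCaches[3], iCache;
    CPUUtil::GetCacheInfo(dCaches, iCache);

    printf("physical cores: %d, logical processors: %d\n",
           CPUUtil::GetNumHWCores(), CPUUtil::GetNumLogicalProcessors());
    printf("L1d/L2/L3: %d/%d/%d bytes, cache line: %d bytes\n",
           dCaches[0], dCaches[1], dCaches[2], CPUUtil::GetCacheLineSize());
    printf("HTT: %d, AVX+FMA: %d\n",
           CPUUtil::GetHTTStatus(), CPUUtil::GetSIMDSupport());
    return 0;
}
```
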
63 | I’ve also implemented a hardware-local thread pool to handle jobs for the multithreaded
64 | *MTMatMul* function. The pool runs every thread belonging to a job
65 | on the same physical core. The idea is that, on hyperthreaded systems such
66 | as mine, 2 threads that work on contiguous parts of memory should live
67 | on the same core and share the same L1 and L2 cache. (A usage sketch follows the list below.)
68 |
69 | - Each job is described as an array of N functions. (N=2)
70 |
71 | - For each job, N threads (that were already created) are assigned respective
72 | functions.
73 |
74 | - For a given job, all threads are guaranteed to be on the same
75 | physical core.
76 |
77 | - No two threads from different jobs are allowed on the same physical
78 | core.
79 |
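Putting it together, a minimal sketch of how jobs are submitted (a hypothetical example; `ComputeLeftHalf`/`ComputeRightHalf` are placeholder functions):

``` c++
#include "ThreadPool.h"

void ComputeLeftHalf() { /* work on the left half of a block */ }
void ComputeRightHalf() { /* work on the right half of a block */ }

int main()
{
    /* 0: use all physical cores; 2 threads pinned per core */
    HWLocalThreadPool tp(0, 2);

    /* the two functions of this job are guaranteed to run
     * on the same physical core, sharing L1/L2 */
    tp.Add({
        HWLocalThreadPool::WrapFunc(ComputeLeftHalf),
        HWLocalThreadPool::WrapFunc(ComputeRightHalf)
    });

    /* drain the queue and join all threads */
    tp.Close();
    return 0;
}
```
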
80 | ## MSVC2017 Build options (over default x64 Release build settings)
81 |
82 | - Maximum optimization: /O2
83 |
84 | - Favor fast code /Ot
85 |
86 | - Enable function level linking: /Gy
87 |
88 | - Enable enhanced instruction set: /arch:AVX2
89 |
90 | - Floating point model: /fp:fast
91 |
92 | - Language: /std:c++17 (for several “if constexpr”s and one std::lcm; otherwise it can be
93 | compiled as C++11)
94 |
95 | # What's next?
96 | * ~~Still a factor of 2 to achieve MKL performance.~~ Achieved and surpassed Eigen (MKL+TBB) performance for most test cases N<15K. Test and optimize for larger matrices.
97 | * Right now, when the prefetch switches are enabled, the instruction retirement rate is about 88%, and the program is neither front-end nor back-end bound; it has excellent pipeline utilization. When the switches are disabled, the retirement rate drops to about 50% and the program is heavily memory bound, with pipelines stalled on loads. However, on my current system (i7-8700K), the binary without prefetching actually computes the output significantly faster (by about 15%). I think this behaviour will depend heavily on the specific CPU and its cache size and performance. Try this on other hardware with different cache performance and varying matrix sizes.
98 | * Wrap the functionality in a replicable and distributable framework that's easy to use.
99 |
100 | # Changelog
101 |
102 | **Note:** Debugging builds have arguments pre-set in MatrixMul.cpp; you can ignore or revert those to accept arguments from the command line.
103 |
104 | ### 27/11/2018
105 | * Cleaned up the code. Split some behaviours into separate functions.
106 | * Implemented runtime detection for best block size parameters for the runtime system.
107 | * Tuned software prefetching; now we do multiple smaller prefetches in between arithmetic operations, with a stride between prefetches.
108 | * More arithmetically dense inner loop. Instead of 3x3 blocks, do 4x3 blocks (3b + 12c + 1 temporary a == 16 registers used), 7 loads, 12 arithmetic operations.
109 | * HWLocalThreadPool takes the number of cores and threads per core as constructor arguments and is not templated anymore. It never should have been.
110 | * Renamed the QueryHWCores namespace to CPUUtil and extended it to support querying cache sizes, HTT/AVX/FMA support etc. using \_\_cpuid.
111 |
112 | ### 15/11/2018
113 | * Implemented **one more level of blocking**, first block holds data in L3 while the second holds the data in L2. To avoid the "job" overhead in thread pool system and to allow for explicit software prefetching, threads groups handle the highest level of blocks. (If the job was issued on lower level blocks, the threads need explicit syncing so that they only issue prefetch command once per L3 block.)
114 | * Implemented **software prefetching**. Now while an L3 block is being computed, next one is loaded into the memory in an asynchronous manner. May implement a similar feature for L2 level blocks later on.
115 | * **Removed** all but one of the *MMHelper_MultBlocks* implementations.
116 | * **Converted** AVX multiply and add intrinsics to **fused multiply add intrinsics** from FMA set.
117 | * **Now the MultBlocks use the loaded __m256 vectors as long as possible without unloading and loading new ones.** Just like we keep the same values in cache and use them as much as possible without unloading, this is the same idea applied to **YMM registers**. This increased the Arithmetic Intensity (FLOP / L1 transferred bytes) metric from 0.25 to 0.67, speeding up the entire matrix multiplication by the same ratio.
118 | * Now fully integrated **VTune** into my workflow to analyze the application.
119 |
120 | ### 13/11/2018
121 | Long and detailed work journal, click to expand
122 |
123 |
124 |
Added a couple of vector sum implementations in the benchmark project to compare different intrinsic approaches. The aim is to achieve maximum throughput with an ILP-minded design. However, the compiler optimizes away the different ways in which I try to maximize throughput for my own specific CPU architecture.
125 |
In order to address this issue, I wrote another benchmark with inline assembly and compiled it with GCC (as MSVC doesn't support inline assembly on the x64 architecture). First of all, I tested GCC's behaviour with intrinsics and found it to be the same as MSVC's for our purposes. Having shown that, I wrote volatile inline assembly to force the compiler to use my implementation. The tests showed that the compiler optimized the intrinsics to almost the same level when optimizations are enabled. But the compiler-optimized versions, and my ASM code, are still not fast enough to compete with BLAS packages. So I'm doing something wrong in the first place, and writing ASM is not the answer.
126 |
Benchmarked auto vectorization, naive intrinsics and other 2 intrinsic based block multiplication implementations, last 2 methods are about 15% faster than naive intrinsics and auto vectorized code. But arithmetic intensity (FLOPs / memory accesses) is still quite low.
127 |
Started analyzing the bottlenecks further using **Intel's VTune and Advisor**. It now became apparent that while I was getting similar results from different approaches, each had **different bottlenecks** which at first I couldn't see. So with this detailed information I should be able to address those bottlenecks.
128 |
Added another intrinsic based block multiplication method, and changed a few implementations to use **FMA** instructions rather than separate multiply-adds, to achieve higher throughput.
129 |
When profiling my program I noticed that small block sizes that fit into the L2 cache yielded a lot of L3 misses, while large blocks that utilized L3 well and cut down the DRAM fetches ran into L2 misses. So, applying the same idea that led to blocking to begin with, I will implement **one more level of blocking** to better utilize the multiple layers of cache.
130 |
131 |
132 |
133 |
134 | ### 09/11/2018
135 | * **Fixed memory leaks!**
136 |
137 | Screenshot of memory usage analysis
138 |
139 |
140 |
141 | (This is the heap profile of the program after running C1 = AB, freeing C1, then running C2 = AB and freeing C2. As can be seen here, all the previously leaked mess (packed tasks, function pointers, CoreHandler member arrays, etc.) is now cleaned up nicely. Note: int[] is the static CPU-core-to-logical-processor map.)
142 |
143 | * **Properly called destructors** where CoreHandler objects are created using placement new into a malloc'ed buffer.
144 | * **Freed BT.mat** (transpose of B) in the methods that use it to convert the problem into row-row dot product.
145 | * ~~Changed the Add function s.t. it accepts std::shared_ptr<std::function<void()>[]>, this is only temporary.~~
146 | * **Changed the Add() semantics**; now the Add function accepts a std::vector<std::function<void()>>. The preferred way of using the Add() function now is with initializer lists:
147 |
148 | ```
149 | tp.Add({
150 | HWLocalThreadPool<>::WrapFunc(MMHelper_MultBlocks,
151 | matData, subX, matA.height - rowC, rowC, colC, matA, matB, matBT) ,
152 | HWLocalThreadPool<>::WrapFunc(MMHelper_MultBlocks,
153 | matData, subX, matA.height - rowC, rowC, colC + subX, matA, matB, matBT)
154 | });
155 | ```
156 | * Added Eigen benchmarks
157 | * Implemented MatMul, which should be the general function exposed to the outside. It simply selects between *MTMatMul* and *ST_TransposedBMatMul* depending on the sizes of the matrices. Current impl.: ```A.height*A.width*B.width < K : ST_TransposedBMatMul, o.w : MTMatMul```
158 |
159 |
--------------------------------------------------------------------------------
/benchmark.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/talhasaruhan/cpp-matmul/e1ef1edf935d5af6d79de15b127d1e8ad13f284c/benchmark.xlsx
--------------------------------------------------------------------------------
/run.bat:
--------------------------------------------------------------------------------
1 | cd x64/Release/ && run.bat
--------------------------------------------------------------------------------