├── .clang-format ├── .gitattributes ├── .gitignore ├── Benchmark1.png ├── Benchmarks ├── Benchmarks.vcxproj ├── Benchmarks.vcxproj.filters ├── EigenBenchmark.cpp ├── IntrinASMDotBenchmark.cpp ├── IntrinsicSumBenchmarks.cpp └── NumpyBenchmark.py ├── MatrixGenerator ├── MatrixGenerator.cpp ├── MatrixGenerator.vcxproj └── MatrixGenerator.vcxproj.filters ├── MatrixMult.sln ├── MatrixMult ├── CPUUtil.cpp ├── CPUUtil.h ├── MatrixMul.cpp ├── MatrixMult.vcxproj ├── MatrixMult.vcxproj.filters └── ThreadPool.h ├── README.md ├── benchmark.xlsx └── run.bat /.clang-format: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: GPL-2.0 2 | # 3 | # clang-format configuration file. Intended for clang-format >= 4. 4 | # 5 | # For more information, see: 6 | # 7 | # Documentation/process/clang-format.rst 8 | # https://clang.llvm.org/docs/ClangFormat.html 9 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html 10 | # 11 | --- 12 | AccessModifierOffset: -4 13 | AlignAfterOpenBracket: Align 14 | AlignConsecutiveAssignments: false 15 | AlignConsecutiveDeclarations: false 16 | #AlignEscapedNewlines: Left # Unknown to clang-format-4.0 17 | AlignOperands: true 18 | AlignTrailingComments: false 19 | AllowAllParametersOfDeclarationOnNextLine: false 20 | AllowShortBlocksOnASingleLine: false 21 | AllowShortCaseLabelsOnASingleLine: false 22 | AllowShortFunctionsOnASingleLine: None 23 | AllowShortIfStatementsOnASingleLine: false 24 | AllowShortLoopsOnASingleLine: false 25 | AlwaysBreakAfterDefinitionReturnType: None 26 | AlwaysBreakAfterReturnType: None 27 | AlwaysBreakBeforeMultilineStrings: false 28 | AlwaysBreakTemplateDeclarations: false 29 | BinPackArguments: true 30 | BinPackParameters: true 31 | BraceWrapping: 32 | AfterClass: false 33 | AfterControlStatement: false 34 | AfterEnum: false 35 | AfterFunction: true 36 | AfterNamespace: true 37 | AfterObjCDeclaration: false 38 | AfterStruct: false 39 | AfterUnion: false 40 | 
#AfterExternBlock: false # Unknown to clang-format-5.0 41 | BeforeCatch: false 42 | BeforeElse: false 43 | IndentBraces: false 44 | #SplitEmptyFunction: true # Unknown to clang-format-4.0 45 | #SplitEmptyRecord: true # Unknown to clang-format-4.0 46 | #SplitEmptyNamespace: true # Unknown to clang-format-4.0 47 | BreakBeforeBinaryOperators: None 48 | BreakBeforeBraces: Custom 49 | #BreakBeforeInheritanceComma: false # Unknown to clang-format-4.0 50 | BreakBeforeTernaryOperators: false 51 | BreakConstructorInitializersBeforeComma: false 52 | #BreakConstructorInitializers: BeforeComma # Unknown to clang-format-4.0 53 | BreakAfterJavaFieldAnnotations: false 54 | BreakStringLiterals: false 55 | ColumnLimit: 88 56 | CommentPragmas: '^ IWYU pragma:' 57 | #CompactNamespaces: false # Unknown to clang-format-4.0 58 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 59 | ConstructorInitializerIndentWidth: 4 60 | ContinuationIndentWidth: 2 61 | Cpp11BracedListStyle: true 62 | DerivePointerAlignment: false 63 | DisableFormat: false 64 | ExperimentalAutoDetectBinPacking: false 65 | #FixNamespaceComments: false # Unknown to clang-format-4.0 66 | 67 | # Taken from: 68 | # git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ \ 69 | # | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ 70 | # | sort | uniq 71 | ForEachMacros: 72 | - 'apei_estatus_for_each_section' 73 | - 'ata_for_each_dev' 74 | - 'ata_for_each_link' 75 | - 'ax25_for_each' 76 | - 'ax25_uid_for_each' 77 | - 'bio_for_each_integrity_vec' 78 | - '__bio_for_each_segment' 79 | - 'bio_for_each_segment' 80 | - 'bio_for_each_segment_all' 81 | - 'bio_list_for_each' 82 | - 'bip_for_each_vec' 83 | - 'blkg_for_each_descendant_post' 84 | - 'blkg_for_each_descendant_pre' 85 | - 'blk_queue_for_each_rl' 86 | - 'bond_for_each_slave' 87 | - 'bond_for_each_slave_rcu' 88 | - 'btree_for_each_safe128' 89 | - 'btree_for_each_safe32' 90 | - 'btree_for_each_safe64' 91 | - 'btree_for_each_safel' 92 | - 
'card_for_each_dev' 93 | - 'cgroup_taskset_for_each' 94 | - 'cgroup_taskset_for_each_leader' 95 | - 'cpufreq_for_each_entry' 96 | - 'cpufreq_for_each_entry_idx' 97 | - 'cpufreq_for_each_valid_entry' 98 | - 'cpufreq_for_each_valid_entry_idx' 99 | - 'css_for_each_child' 100 | - 'css_for_each_descendant_post' 101 | - 'css_for_each_descendant_pre' 102 | - 'device_for_each_child_node' 103 | - 'drm_atomic_crtc_for_each_plane' 104 | - 'drm_atomic_crtc_state_for_each_plane' 105 | - 'drm_atomic_crtc_state_for_each_plane_state' 106 | - 'drm_for_each_connector_iter' 107 | - 'drm_for_each_crtc' 108 | - 'drm_for_each_encoder' 109 | - 'drm_for_each_encoder_mask' 110 | - 'drm_for_each_fb' 111 | - 'drm_for_each_legacy_plane' 112 | - 'drm_for_each_plane' 113 | - 'drm_for_each_plane_mask' 114 | - 'drm_mm_for_each_hole' 115 | - 'drm_mm_for_each_node' 116 | - 'drm_mm_for_each_node_in_range' 117 | - 'drm_mm_for_each_node_safe' 118 | - 'for_each_active_drhd_unit' 119 | - 'for_each_active_iommu' 120 | - 'for_each_available_child_of_node' 121 | - 'for_each_bio' 122 | - 'for_each_board_func_rsrc' 123 | - 'for_each_bvec' 124 | - 'for_each_child_of_node' 125 | - 'for_each_clear_bit' 126 | - 'for_each_clear_bit_from' 127 | - 'for_each_cmsghdr' 128 | - 'for_each_compatible_node' 129 | - 'for_each_console' 130 | - 'for_each_cpu' 131 | - 'for_each_cpu_and' 132 | - 'for_each_cpu_not' 133 | - 'for_each_cpu_wrap' 134 | - 'for_each_dev_addr' 135 | - 'for_each_dma_cap_mask' 136 | - 'for_each_drhd_unit' 137 | - 'for_each_dss_dev' 138 | - 'for_each_efi_memory_desc' 139 | - 'for_each_efi_memory_desc_in_map' 140 | - 'for_each_endpoint_of_node' 141 | - 'for_each_evictable_lru' 142 | - 'for_each_fib6_node_rt_rcu' 143 | - 'for_each_fib6_walker_rt' 144 | - 'for_each_free_mem_range' 145 | - 'for_each_free_mem_range_reverse' 146 | - 'for_each_func_rsrc' 147 | - 'for_each_hstate' 148 | - 'for_each_if' 149 | - 'for_each_iommu' 150 | - 'for_each_ip_tunnel_rcu' 151 | - 'for_each_irq_nr' 152 | - 'for_each_lru' 153 
| - 'for_each_matching_node' 154 | - 'for_each_matching_node_and_match' 155 | - 'for_each_memblock' 156 | - 'for_each_memblock_type' 157 | - 'for_each_memcg_cache_index' 158 | - 'for_each_mem_pfn_range' 159 | - 'for_each_mem_range' 160 | - 'for_each_mem_range_rev' 161 | - 'for_each_migratetype_order' 162 | - 'for_each_msi_entry' 163 | - 'for_each_net' 164 | - 'for_each_netdev' 165 | - 'for_each_netdev_continue' 166 | - 'for_each_netdev_continue_rcu' 167 | - 'for_each_netdev_feature' 168 | - 'for_each_netdev_in_bond_rcu' 169 | - 'for_each_netdev_rcu' 170 | - 'for_each_netdev_reverse' 171 | - 'for_each_netdev_safe' 172 | - 'for_each_net_rcu' 173 | - 'for_each_new_connector_in_state' 174 | - 'for_each_new_crtc_in_state' 175 | - 'for_each_new_plane_in_state' 176 | - 'for_each_new_private_obj_in_state' 177 | - 'for_each_node' 178 | - 'for_each_node_by_name' 179 | - 'for_each_node_by_type' 180 | - 'for_each_node_mask' 181 | - 'for_each_node_state' 182 | - 'for_each_node_with_cpus' 183 | - 'for_each_node_with_property' 184 | - 'for_each_of_allnodes' 185 | - 'for_each_of_allnodes_from' 186 | - 'for_each_of_pci_range' 187 | - 'for_each_old_connector_in_state' 188 | - 'for_each_old_crtc_in_state' 189 | - 'for_each_oldnew_connector_in_state' 190 | - 'for_each_oldnew_crtc_in_state' 191 | - 'for_each_oldnew_plane_in_state' 192 | - 'for_each_oldnew_private_obj_in_state' 193 | - 'for_each_old_plane_in_state' 194 | - 'for_each_old_private_obj_in_state' 195 | - 'for_each_online_cpu' 196 | - 'for_each_online_node' 197 | - 'for_each_online_pgdat' 198 | - 'for_each_pci_bridge' 199 | - 'for_each_pci_dev' 200 | - 'for_each_pci_msi_entry' 201 | - 'for_each_populated_zone' 202 | - 'for_each_possible_cpu' 203 | - 'for_each_present_cpu' 204 | - 'for_each_prime_number' 205 | - 'for_each_prime_number_from' 206 | - 'for_each_process' 207 | - 'for_each_process_thread' 208 | - 'for_each_property_of_node' 209 | - 'for_each_reserved_mem_region' 210 | - 'for_each_resv_unavail_range' 211 | - 
'for_each_rtdcom' 212 | - 'for_each_rtdcom_safe' 213 | - 'for_each_set_bit' 214 | - 'for_each_set_bit_from' 215 | - 'for_each_sg' 216 | - 'for_each_sg_page' 217 | - '__for_each_thread' 218 | - 'for_each_thread' 219 | - 'for_each_zone' 220 | - 'for_each_zone_zonelist' 221 | - 'for_each_zone_zonelist_nodemask' 222 | - 'fwnode_for_each_available_child_node' 223 | - 'fwnode_for_each_child_node' 224 | - 'fwnode_graph_for_each_endpoint' 225 | - 'gadget_for_each_ep' 226 | - 'hash_for_each' 227 | - 'hash_for_each_possible' 228 | - 'hash_for_each_possible_rcu' 229 | - 'hash_for_each_possible_rcu_notrace' 230 | - 'hash_for_each_possible_safe' 231 | - 'hash_for_each_rcu' 232 | - 'hash_for_each_safe' 233 | - 'hctx_for_each_ctx' 234 | - 'hlist_bl_for_each_entry' 235 | - 'hlist_bl_for_each_entry_rcu' 236 | - 'hlist_bl_for_each_entry_safe' 237 | - 'hlist_for_each' 238 | - 'hlist_for_each_entry' 239 | - 'hlist_for_each_entry_continue' 240 | - 'hlist_for_each_entry_continue_rcu' 241 | - 'hlist_for_each_entry_continue_rcu_bh' 242 | - 'hlist_for_each_entry_from' 243 | - 'hlist_for_each_entry_from_rcu' 244 | - 'hlist_for_each_entry_rcu' 245 | - 'hlist_for_each_entry_rcu_bh' 246 | - 'hlist_for_each_entry_rcu_notrace' 247 | - 'hlist_for_each_entry_safe' 248 | - '__hlist_for_each_rcu' 249 | - 'hlist_for_each_safe' 250 | - 'hlist_nulls_for_each_entry' 251 | - 'hlist_nulls_for_each_entry_from' 252 | - 'hlist_nulls_for_each_entry_rcu' 253 | - 'hlist_nulls_for_each_entry_safe' 254 | - 'ide_host_for_each_port' 255 | - 'ide_port_for_each_dev' 256 | - 'ide_port_for_each_present_dev' 257 | - 'idr_for_each_entry' 258 | - 'idr_for_each_entry_continue' 259 | - 'idr_for_each_entry_ul' 260 | - 'inet_bind_bucket_for_each' 261 | - 'inet_lhash2_for_each_icsk_rcu' 262 | - 'iov_for_each' 263 | - 'key_for_each' 264 | - 'key_for_each_safe' 265 | - 'klp_for_each_func' 266 | - 'klp_for_each_object' 267 | - 'kvm_for_each_memslot' 268 | - 'kvm_for_each_vcpu' 269 | - 'list_for_each' 270 | - 'list_for_each_entry' 
271 | - 'list_for_each_entry_continue' 272 | - 'list_for_each_entry_continue_rcu' 273 | - 'list_for_each_entry_continue_reverse' 274 | - 'list_for_each_entry_from' 275 | - 'list_for_each_entry_from_reverse' 276 | - 'list_for_each_entry_lockless' 277 | - 'list_for_each_entry_rcu' 278 | - 'list_for_each_entry_reverse' 279 | - 'list_for_each_entry_safe' 280 | - 'list_for_each_entry_safe_continue' 281 | - 'list_for_each_entry_safe_from' 282 | - 'list_for_each_entry_safe_reverse' 283 | - 'list_for_each_prev' 284 | - 'list_for_each_prev_safe' 285 | - 'list_for_each_safe' 286 | - 'llist_for_each' 287 | - 'llist_for_each_entry' 288 | - 'llist_for_each_entry_safe' 289 | - 'llist_for_each_safe' 290 | - 'media_device_for_each_entity' 291 | - 'media_device_for_each_intf' 292 | - 'media_device_for_each_link' 293 | - 'media_device_for_each_pad' 294 | - 'netdev_for_each_lower_dev' 295 | - 'netdev_for_each_lower_private' 296 | - 'netdev_for_each_lower_private_rcu' 297 | - 'netdev_for_each_mc_addr' 298 | - 'netdev_for_each_uc_addr' 299 | - 'netdev_for_each_upper_dev_rcu' 300 | - 'netdev_hw_addr_list_for_each' 301 | - 'nft_rule_for_each_expr' 302 | - 'nla_for_each_attr' 303 | - 'nla_for_each_nested' 304 | - 'nlmsg_for_each_attr' 305 | - 'nlmsg_for_each_msg' 306 | - 'nr_neigh_for_each' 307 | - 'nr_neigh_for_each_safe' 308 | - 'nr_node_for_each' 309 | - 'nr_node_for_each_safe' 310 | - 'of_for_each_phandle' 311 | - 'of_property_for_each_string' 312 | - 'of_property_for_each_u32' 313 | - 'pci_bus_for_each_resource' 314 | - 'ping_portaddr_for_each_entry' 315 | - 'plist_for_each' 316 | - 'plist_for_each_continue' 317 | - 'plist_for_each_entry' 318 | - 'plist_for_each_entry_continue' 319 | - 'plist_for_each_entry_safe' 320 | - 'plist_for_each_safe' 321 | - 'pnp_for_each_card' 322 | - 'pnp_for_each_dev' 323 | - 'protocol_for_each_card' 324 | - 'protocol_for_each_dev' 325 | - 'queue_for_each_hw_ctx' 326 | - 'radix_tree_for_each_slot' 327 | - 'radix_tree_for_each_tagged' 328 | - 
'rbtree_postorder_for_each_entry_safe' 329 | - 'resource_list_for_each_entry' 330 | - 'resource_list_for_each_entry_safe' 331 | - 'rhl_for_each_entry_rcu' 332 | - 'rhl_for_each_rcu' 333 | - 'rht_for_each' 334 | - 'rht_for_each_continue' 335 | - 'rht_for_each_entry' 336 | - 'rht_for_each_entry_continue' 337 | - 'rht_for_each_entry_rcu' 338 | - 'rht_for_each_entry_rcu_continue' 339 | - 'rht_for_each_entry_safe' 340 | - 'rht_for_each_rcu' 341 | - 'rht_for_each_rcu_continue' 342 | - '__rq_for_each_bio' 343 | - 'rq_for_each_segment' 344 | - 'scsi_for_each_prot_sg' 345 | - 'scsi_for_each_sg' 346 | - 'sctp_for_each_hentry' 347 | - 'sctp_skb_for_each' 348 | - 'shdma_for_each_chan' 349 | - '__shost_for_each_device' 350 | - 'shost_for_each_device' 351 | - 'sk_for_each' 352 | - 'sk_for_each_bound' 353 | - 'sk_for_each_entry_offset_rcu' 354 | - 'sk_for_each_from' 355 | - 'sk_for_each_rcu' 356 | - 'sk_for_each_safe' 357 | - 'sk_nulls_for_each' 358 | - 'sk_nulls_for_each_from' 359 | - 'sk_nulls_for_each_rcu' 360 | - 'snd_pcm_group_for_each_entry' 361 | - 'snd_soc_dapm_widget_for_each_path' 362 | - 'snd_soc_dapm_widget_for_each_path_safe' 363 | - 'snd_soc_dapm_widget_for_each_sink_path' 364 | - 'snd_soc_dapm_widget_for_each_source_path' 365 | - 'tb_property_for_each' 366 | - 'udp_portaddr_for_each_entry' 367 | - 'udp_portaddr_for_each_entry_rcu' 368 | - 'usb_hub_for_each_child' 369 | - 'v4l2_device_for_each_subdev' 370 | - 'v4l2_m2m_for_each_dst_buf' 371 | - 'v4l2_m2m_for_each_dst_buf_safe' 372 | - 'v4l2_m2m_for_each_src_buf' 373 | - 'v4l2_m2m_for_each_src_buf_safe' 374 | - 'zorro_for_each_dev' 375 | 376 | #IncludeBlocks: Preserve # Unknown to clang-format-5.0 377 | IncludeCategories: 378 | - Regex: '.*' 379 | Priority: 1 380 | IncludeIsMainRegex: '(Test)?$' 381 | IndentCaseLabels: false 382 | #IndentPPDirectives: None # Unknown to clang-format-5.0 383 | IndentWidth: 4 384 | IndentWrappedFunctionNames: false 385 | JavaScriptQuotes: Leave 386 | JavaScriptWrapImports: true 387 | 
KeepEmptyLinesAtTheStartOfBlocks: false 388 | MacroBlockBegin: '' 389 | MacroBlockEnd: '' 390 | MaxEmptyLinesToKeep: 1 391 | NamespaceIndentation: All 392 | #ObjCBinPackProtocolList: Auto # Unknown to clang-format-5.0 393 | ObjCBlockIndentWidth: 4 394 | ObjCSpaceAfterProperty: true 395 | ObjCSpaceBeforeProtocolList: true 396 | 397 | # Taken from git's rules 398 | #PenaltyBreakAssignment: 10 # Unknown to clang-format-4.0 399 | PenaltyBreakBeforeFirstCallParameter: 30 400 | PenaltyBreakComment: 10 401 | PenaltyBreakFirstLessLess: 0 402 | PenaltyBreakString: 10 403 | PenaltyExcessCharacter: 100 404 | PenaltyReturnTypeOnItsOwnLine: 60 405 | 406 | PointerAlignment: Left 407 | ReflowComments: false 408 | SortIncludes: false 409 | #SortUsingDeclarations: false # Unknown to clang-format-4.0 410 | SpaceAfterCStyleCast: false 411 | SpaceAfterTemplateKeyword: true 412 | SpaceBeforeAssignmentOperators: true 413 | #SpaceBeforeCtorInitializerColon: true # Unknown to clang-format-5.0 414 | #SpaceBeforeInheritanceColon: true # Unknown to clang-format-5.0 415 | SpaceBeforeParens: ControlStatements 416 | #SpaceBeforeRangeBasedForLoopColon: true # Unknown to clang-format-5.0 417 | SpaceInEmptyParentheses: false 418 | SpacesBeforeTrailingComments: 1 419 | SpacesInAngles: false 420 | SpacesInContainerLiterals: false 421 | SpacesInCStyleCastParentheses: false 422 | SpacesInParentheses: false 423 | SpacesInSquareBrackets: false 424 | Standard: Cpp11 425 | TabWidth: 4 426 | UseTab: Never 427 | ... 428 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 
3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 
42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | bld/ 21 | [Bb]in/ 22 | [Oo]bj/ 23 | [Ll]og/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | project.fragment.lock.json 46 | artifacts/ 47 | 48 | *_i.c 49 | *_p.c 50 | *_i.h 51 | *.ilk 52 | *.meta 53 | *.obj 54 | *.pch 55 | *.pdb 56 | *.pgc 57 | *.pgd 58 | *.rsp 59 | *.sbr 60 | *.tlb 61 | *.tli 62 | *.tlh 63 | *.tmp 64 | *.tmp_proj 65 | *.log 66 | *.vspscc 67 | *.vssscc 68 | .builds 69 | *.pidb 70 | *.svclog 71 | *.scc 72 | 73 | # Chutzpah Test files 74 | _Chutzpah* 75 | 76 | # Visual C++ cache files 77 | ipch/ 78 | *.aps 79 | *.ncb 80 | *.opendb 81 | *.opensdf 82 | *.sdf 83 | *.cachefile 84 | *.VC.db 85 | *.VC.VC.opendb 86 | 87 | # Visual Studio profiler 88 | *.psess 89 | *.vsp 90 | *.vspx 91 | *.sap 92 | 93 | # TFS 2012 Local Workspace 94 | $tf/ 95 | 96 | # Guidance Automation Toolkit 97 | *.gpState 98 | 99 | # ReSharper is a .NET coding add-in 100 | _ReSharper*/ 101 | *.[Rr]e[Ss]harper 102 | *.DotSettings.user 103 | 104 | # JustCode is a .NET coding add-in 105 | .JustCode 106 | 107 | # TeamCity is a build add-in 108 | _TeamCity* 109 | 110 | # DotCover is a Code Coverage Tool 111 | *.dotCover 112 | 113 | # NCrunch 114 | _NCrunch_* 115 | .*crunch*.local.xml 116 | nCrunchTemp_* 117 | 118 | # MightyMoose 119 | *.mm.* 120 | AutoTest.Net/ 121 | 122 | # Web workbench (sass) 123 | .sass-cache/ 124 | 
125 | # Installshield output folder 126 | [Ee]xpress/ 127 | 128 | # DocProject is a documentation generator add-in 129 | DocProject/buildhelp/ 130 | DocProject/Help/*.HxT 131 | DocProject/Help/*.HxC 132 | DocProject/Help/*.hhc 133 | DocProject/Help/*.hhk 134 | DocProject/Help/*.hhp 135 | DocProject/Help/Html2 136 | DocProject/Help/html 137 | 138 | # Click-Once directory 139 | publish/ 140 | 141 | # Publish Web Output 142 | *.[Pp]ublish.xml 143 | *.azurePubxml 144 | # TODO: Comment the next line if you want to checkin your web deploy settings 145 | # but database connection strings (with potential passwords) will be unencrypted 146 | #*.pubxml 147 | *.publishproj 148 | 149 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 150 | # checkin your Azure Web App publish settings, but sensitive information contained 151 | # in these scripts will be unencrypted 152 | PublishScripts/ 153 | 154 | # NuGet Packages 155 | *.nupkg 156 | # The packages folder can be ignored because of Package Restore 157 | **/packages/* 158 | # except build/, which is used as an MSBuild target. 
159 | !**/packages/build/ 160 | # Uncomment if necessary however generally it will be regenerated when needed 161 | #!**/packages/repositories.config 162 | # NuGet v3's project.json files produces more ignoreable files 163 | *.nuget.props 164 | *.nuget.targets 165 | 166 | # Microsoft Azure Build Output 167 | csx/ 168 | *.build.csdef 169 | 170 | # Microsoft Azure Emulator 171 | ecf/ 172 | rcf/ 173 | 174 | # Windows Store app package directories and files 175 | AppPackages/ 176 | BundleArtifacts/ 177 | Package.StoreAssociation.xml 178 | _pkginfo.txt 179 | 180 | # Visual Studio cache files 181 | # files ending in .cache can be ignored 182 | *.[Cc]ache 183 | # but keep track of directories ending in .cache 184 | !*.[Cc]ache/ 185 | 186 | # Others 187 | ClientBin/ 188 | ~$* 189 | *~ 190 | *.dbmdl 191 | *.dbproj.schemaview 192 | *.jfm 193 | *.pfx 194 | *.publishsettings 195 | node_modules/ 196 | orleans.codegen.cs 197 | 198 | # Since there are multiple workflows, uncomment next line to ignore bower_components 199 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 200 | #bower_components/ 201 | 202 | # RIA/Silverlight projects 203 | Generated_Code/ 204 | 205 | # Backup & report files from converting an old project file 206 | # to a newer Visual Studio version. 
Backup files are not needed, 207 | # because we have git ;-) 208 | _UpgradeReport_Files/ 209 | Backup*/ 210 | UpgradeLog*.XML 211 | UpgradeLog*.htm 212 | 213 | # SQL Server files 214 | *.mdf 215 | *.ldf 216 | 217 | # Business Intelligence projects 218 | *.rdl.data 219 | *.bim.layout 220 | *.bim_*.settings 221 | 222 | # Microsoft Fakes 223 | FakesAssemblies/ 224 | 225 | # GhostDoc plugin setting file 226 | *.GhostDoc.xml 227 | 228 | # Node.js Tools for Visual Studio 229 | .ntvs_analysis.dat 230 | 231 | # Visual Studio 6 build log 232 | *.plg 233 | 234 | # Visual Studio 6 workspace options file 235 | *.opt 236 | 237 | # Visual Studio LightSwitch build output 238 | **/*.HTMLClient/GeneratedArtifacts 239 | **/*.DesktopClient/GeneratedArtifacts 240 | **/*.DesktopClient/ModelManifest.xml 241 | **/*.Server/GeneratedArtifacts 242 | **/*.Server/ModelManifest.xml 243 | _Pvt_Extensions 244 | 245 | # Paket dependency manager 246 | .paket/paket.exe 247 | paket-files/ 248 | 249 | # FAKE - F# Make 250 | .fake/ 251 | 252 | # JetBrains Rider 253 | .idea/ 254 | *.sln.iml 255 | 256 | # CodeRush 257 | .cr/ 258 | 259 | # Python Tools for Visual Studio (PTVS) 260 | __pycache__/ 261 | *.pyc 262 | /MatrixMult/matrixB.bin 263 | /MatrixMult/matrixA.bin 264 | /matrixB.bin 265 | /matrixAB.bin 266 | /matrixA.bin 267 | /MatrixMult/matrixAB.bin 268 | /MatrixMult/matrixAB-out.bin 269 | /Benchmarks/My Inspector Results - Benchmarks/My Inspector Results - Benchmarks.inspxeproj 270 | /matrixAB-out.bin 271 | /MatrixMult/My Advisor Results - MatrixMult 272 | /MatrixMult/My Amplifier Results - MatrixMult 273 | /Benchmarks/matrixB.bin 274 | /Benchmarks/matrixAB.bin 275 | /Benchmarks/matrixA.bin 276 | /Benchmarks/My Advisor Results - Benchmarks 277 | /Benchmarks/My Amplifier Results - Benchmarks 278 | /MatrixGenerator/My Amplifier Results - MatrixGenerator 279 | /MatrixMult/matrixB11000.bin 280 | /MatrixMult/matrixAB11000.bin 281 | /MatrixMult/matrixA11000.bin 282 | /MatrixMult/matrixB9000.bin 283 | 
/MatrixMult/matrixA9000.bin 284 | /MatrixMult/My Inspector Results - MatrixMult 285 | /MatrixMult/matrixB1000.bin 286 | /MatrixMult/matrixA1000.bin 287 | /MatrixMult/matrixBx.bin 288 | /MatrixMult/matrixAx.bin 289 | /MatrixMult/matrixABx.bin 290 | -------------------------------------------------------------------------------- /Benchmark1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talhasaruhan/cpp-matmul/e1ef1edf935d5af6d79de15b127d1e8ad13f284c/Benchmark1.png -------------------------------------------------------------------------------- /Benchmarks/Benchmarks.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 15.0 23 | {5895928A-FD77-4426-9588-36399A75D082} 24 | Benchmarks 25 | 10.0.16299.0 26 | 27 | 28 | 29 | Application 30 | true 31 | v141 32 | MultiByte 33 | 34 | 35 | Application 36 | false 37 | v141 38 | true 39 | MultiByte 40 | 41 | 42 | Application 43 | true 44 | v141 45 | MultiByte 46 | 47 | 48 | Application 49 | false 50 | v141 51 | true 52 | MultiByte 53 | true 54 | Parallel 55 | true 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | $(ExecutablePath) 77 | $(SourcePath) 78 | 79 | 80 | 81 | Level3 82 | MaxSpeed 83 | true 84 | true 85 | true 86 | true 87 | C:\eigen; 88 | Speed 89 | true 90 | AdvancedVectorExtensions2 91 | Fast 92 | true 93 | /DMKL_ILP64 -I"%MKLROOT%"\include %(AdditionalOptions) 94 | MultiThreaded 95 | true 96 | true 97 | true 98 | No 99 | false 100 | false 101 | 102 | 103 | true 104 | true 105 | mkl_intel_ilp64.lib; mkl_tbb_thread.lib; mkl_core.lib; tbb.lib 106 | C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2019.0.117\windows\mkl\lib\intel64_win;C:\Program Files 
(x86)\IntelSWTools\compilers_and_libraries_2019\windows\tbb\lib\intel64_win\vc14_uwp 107 | /DMKL_ILP64 -I"%MKLROOT%"\include %(AdditionalOptions) 108 | Console 109 | 110 | 111 | 112 | 113 | Level3 114 | Disabled 115 | true 116 | true 117 | 118 | 119 | 120 | 121 | Level3 122 | Disabled 123 | true 124 | true 125 | 126 | 127 | 128 | 129 | Level3 130 | MaxSpeed 131 | true 132 | true 133 | true 134 | true 135 | 136 | 137 | true 138 | true 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /Benchmarks/Benchmarks.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | Source Files 26 | 27 | 28 | -------------------------------------------------------------------------------- /Benchmarks/EigenBenchmark.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define EIGEN_USE_MKL_ALL 9 | #include 10 | 11 | using namespace std; 12 | using namespace Eigen; 13 | 14 | int main(int argc, char* argv[]) 15 | { 16 | int K; 17 | if (argc == 1) { 18 | K = 10000; 19 | } else if (argc == 2) { 20 | /* 2 NxN */ 21 | K = atoi(argv[1]); 22 | assert(K > 0); 23 | } 24 | 25 | mkl_set_num_threads(12); 26 | setNbThreads(12); 27 | 28 | MatrixXd matA = MatrixXd::Random(K, K); 29 | MatrixXd matB = MatrixXd::Random(K, K); 30 | 31 | auto start = std::chrono::high_resolution_clock::now(); 32 | MatrixXd matC = matA * matB; 33 | auto end = 
std::chrono::high_resolution_clock::now(); 34 | 35 | std::cout 36 | << "Matrix Multiplication: " 37 | << std::chrono::duration_cast(end - start).count() 38 | << " microseconds.\n"; 39 | } 40 | -------------------------------------------------------------------------------- /Benchmarks/IntrinASMDotBenchmark.cpp: -------------------------------------------------------------------------------- 1 | //#include 2 | //#include 3 | //#include 4 | //#include 5 | //#include 6 | //#include 7 | //#include 8 | //#include 9 | //#include 10 | //#include 11 | //#include 12 | // 13 | //using namespace std; 14 | // 15 | //#define AVX_ALIGNMENT 32 16 | // 17 | //float VecDotIntrinsicExplicit1(float* const a, float* const b, const unsigned N) 18 | //{ 19 | // float* vsum = (float*)aligned_alloc(8 * sizeof(float), AVX_ALIGNMENT); 20 | // for (int i = 0; i<8; ++i) vsum[i] = 0; 21 | // 22 | // __m256 sum = _mm256_setzero_ps(); 23 | // __m256 a1, a2, a3, a4, a5, a6, a7, a8; 24 | // __m256 b1, b2, b3, b4, b5, b6, b7, b8; 25 | // 26 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 327 | // 328 | // /*****************************************************/ 329 | // 330 | // cout << t1 << endl; 331 | //} 332 | // 333 | ////int main() { 334 | //// ILPSum(); 335 | ////} -------------------------------------------------------------------------------- /Benchmarks/IntrinsicSumBenchmarks.cpp: -------------------------------------------------------------------------------- 1 | //#include 2 | //#include 3 | //#include 4 | //#include 5 | //#include 6 | //#include 7 | //#include 8 | // 9 | //using namespace std; 10 | // 11 | //#define AVX_ALIGNMENT 32 12 | // 13 | ///* naive sum using intrinsics */ 14 | //float VecSumIntrinsicNaiveLoop(const float* const __restrict c, const unsigned N) 15 | //{ 16 | // _declspec(align(32)) float vsum[8]; 17 | // for (int i = 0; i<8; ++i) vsum[i] = 0; 18 | // 19 | // __m256 sum = _mm256_setzero_ps(); 20 | // __m256 x0, x1; 21 | // 22 | // for (int i = 0; 
i> 1; 282 | // _mm256_store_ps(&c[j + 0], c1); 283 | // _mm256_store_ps(&c[j + 8], c2); 284 | // _mm256_store_ps(&c[j + 16], c3); 285 | // _mm256_store_ps(&c[j + 24], c4); 286 | // _mm256_store_ps(&c[j + 32], c5); 287 | // _mm256_store_ps(&c[j + 40], c6); 288 | // _mm256_store_ps(&c[j + 48], c7); 289 | // _mm256_store_ps(&c[j + 56], c8); 290 | // } 291 | // } 292 | // 293 | // return VecSumIntrinsicNaiveLoop(c, 64); 294 | //} 295 | // 296 | ///* scalar sum */ 297 | //float VecSumScalarAccumulate(const float* const __restrict c, const unsigned N) { 298 | // /* 299 | // * compiler optimizes this by keeping t in an xmm register 300 | // * s.t at every iteration, we do 1 load and 1 add 301 | // * but t <- add(t, ai) is obviously dependent on t 302 | // * so there goes the ILP. 303 | // */ 304 | // 305 | // float t = 0; 306 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 350 | // 351 | // /*****************************************************/ 352 | // 353 | // //memcpy(ar_cpy, ar, N * sizeof(float)); 354 | // 355 | // //start = std::chrono::high_resolution_clock::now(); 356 | // //t2 = VecSumScalarBinary(ar_cpy, N, K); 357 | // //end = std::chrono::high_resolution_clock::now(); 358 | // //std::cout << "C++ Binary sum: " << std::chrono::duration_cast(end - start).count() << " milliseconds.\n"; 359 | // 360 | // /*****************************************************/ 361 | // 362 | // start = std::chrono::high_resolution_clock::now(); 363 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 367 | // 368 | // /*****************************************************/ 369 | // 370 | // start = std::chrono::high_resolution_clock::now(); 371 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 375 | // 376 | // /*****************************************************/ 377 | // 378 | // start = std::chrono::high_resolution_clock::now(); 379 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 383 | // 384 | // 
/*****************************************************/ 385 | // 386 | // start = std::chrono::high_resolution_clock::now(); 387 | // for (int i = 0; i(end - start).count() << " milliseconds.\n"; 391 | // 392 | // /*****************************************************/ 393 | // 394 | // cout << t1 << endl; 395 | // cout << t3 << endl; 396 | // cout << t4 << endl; 397 | // cout << t5 << endl; 398 | // cout << t6 << endl; 399 | //} 400 | // 401 | //int main() { 402 | // ILPSum(); 403 | //} -------------------------------------------------------------------------------- /Benchmarks/NumpyBenchmark.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | n = 1000 5 | a = np.random.randn(n, n)*50 6 | 7 | start = time.time() 8 | b = np.dot(a, a) 9 | end = time.time() 10 | 11 | print(end-start) -------------------------------------------------------------------------------- /MatrixGenerator/MatrixGenerator.cpp: -------------------------------------------------------------------------------- 1 | #define WIN32_LEAN_AND_MEAN 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | #define AVX_ALIGN 32 24 | 25 | typedef struct Mat 26 | { 27 | unsigned width; 28 | unsigned height; 29 | unsigned rowSpan; 30 | float *mat; 31 | } Mat; 32 | 33 | template 34 | static void RandInitMat(Mat *m, Rand &r) 35 | { 36 | for(unsigned y=0; yheight; ++y) 37 | for(unsigned x=0; xwidth; ++x) 38 | m->mat[y*m->rowSpan + x] = r(); 39 | } 40 | 41 | const Mat LoadMat(const char * const filename) { 42 | Mat mat; 43 | uint32_t matSize; 44 | 45 | std::ifstream in(filename, std::ios::binary | std::ios::in); 46 | 47 | if (!in.is_open()) { 48 | std::cerr << "Err loading!\n"; 49 | return {}; 50 | } 51 | 52 | 
in.read((char*)&mat, 3 * sizeof(uint32_t)); 53 | in.read((char*)&matSize, sizeof(uint32_t)); 54 | in.seekg(12*sizeof(uint32_t), std::ios::cur); 55 | mat.mat = (float*)malloc(matSize); 56 | in.read((char*)mat.mat, matSize); 57 | 58 | in.close(); 59 | 60 | return mat; 61 | } 62 | 63 | static void DumpMat(const char *filename, const Mat &m) 64 | { 65 | uint32_t header[16]; 66 | std::ofstream out(filename, std::ofstream::binary | std::ofstream::out); 67 | 68 | header[0] = m.width; 69 | header[1] = m.height; 70 | header[2] = m.rowSpan; 71 | header[3] = m.height * m.rowSpan * sizeof(float); 72 | 73 | out.write(reinterpret_cast(header), sizeof(header)); 74 | out.write(reinterpret_cast(m.mat), header[3]); 75 | 76 | out.close(); 77 | } 78 | 79 | static unsigned RoundUpPwr2(unsigned val, unsigned pwr2) 80 | { 81 | return (val + (pwr2 - 1)) & (~(pwr2 - 1)); 82 | } 83 | 84 | /* This function prints the given matrix to given std::ostream */ 85 | static void PrintMat(const Mat& mat, std::ostream& stream) 86 | { 87 | stream << "w, h, rS: " << mat.width << " " << mat.height << " " << mat.rowSpan 88 | << "\n"; 89 | for (int i = 0; i < mat.height; i++) { 90 | for (int j = 0; j < mat.width; ++j) { 91 | stream << mat.mat[i * mat.rowSpan + j] << " "; 92 | } 93 | stream << "\n"; 94 | } 95 | } 96 | 97 | 98 | /* Single threaded, do i need to multithread this as well? 99 | Honestly, I don't think it will have any significant effect. 
n^2 vs n^3 */ 100 | __declspec(noalias) const Mat TransposeMat(const Mat& mat) 101 | { 102 | const unsigned tRowSpan = RoundUpPwr2(mat.height, 64 / sizeof(float)); 103 | float* __restrict const tData = 104 | (float*)_aligned_malloc(mat.width * tRowSpan * sizeof(float), AVX_ALIGN); 105 | 106 | Mat T{ mat.height, mat.width, tRowSpan, tData }; 107 | 108 | // hah, the loops are truly interchangable as we encounter a cache miss either ways 109 | for (int rowT = 0; rowT < T.height; ++rowT) { 110 | for (int colT = 0; colT < T.width; ++colT) { 111 | tData[rowT * tRowSpan + colT] = mat.mat[colT * mat.rowSpan + rowT]; 112 | } 113 | } 114 | 115 | return T; 116 | } 117 | 118 | const Mat ST_TransposedBMatMul(const Mat& matA, const Mat& matB) 119 | { 120 | /* Now, I thought transposing B and then traversing it row order would help and it does! 121 | * Also, note that, if we manually unrolled the loop here, compiler wouldn't vectorize the loop for some reason 122 | * (1301: Loop stride is not +1.) is the exact compiler message. 
*/ 123 | float* __restrict const matData = 124 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN); 125 | 126 | Mat matC{ matB.width, matA.height, matB.rowSpan, matData }; 127 | 128 | const Mat matBT = TransposeMat(matB); 129 | for (int rowC = 0; rowC < matA.height; ++rowC) { 130 | for (int colC = 0; colC < matB.width; ++colC) { 131 | float accumulate = 0; 132 | for (int pos = 0; pos < matA.width; ++pos) { 133 | accumulate += matA.mat[rowC * matA.rowSpan + pos] * 134 | matBT.mat[colC * matBT.rowSpan + pos]; 135 | } 136 | matData[rowC * matB.rowSpan + colC] = accumulate; 137 | } 138 | } 139 | 140 | _aligned_free(matBT.mat); 141 | 142 | return matC; 143 | } 144 | 145 | int _cdecl main(int argc, char *argv[]) 146 | { 147 | static const unsigned ALIGN = 64; 148 | static const unsigned FLT_ALIGN = ALIGN / sizeof(float); 149 | 150 | std::random_device rd; 151 | std::uniform_real_distribution matValDist(-50.0f, 50.0f); 152 | auto matRand = std::bind(matValDist, std::ref(rd)); 153 | Mat a, b; 154 | std::string suffix; 155 | 156 | if (argc == 1) { 157 | /* randomly generated */ 158 | std::uniform_int_distribution matSizeDist(100, 1000); 159 | auto sizeRand = std::bind(matSizeDist, std::ref(rd)); 160 | a.width = sizeRand(); 161 | a.height = sizeRand(); 162 | a.rowSpan = RoundUpPwr2(a.width, FLT_ALIGN); 163 | 164 | b.width = sizeRand(); 165 | b.height = a.width; 166 | 167 | suffix = ""; 168 | } 169 | else if (argc == 2) { 170 | /* 2 NxN */ 171 | const int N = atoi(argv[1]); 172 | assert(N > 0); 173 | a.width = N; 174 | a.height = N; 175 | b.width = N; 176 | b.height = N; 177 | 178 | suffix = ""; 179 | } 180 | else if (argc == 3) { 181 | /* 2 NxN */ 182 | const int N = atoi(argv[1]); 183 | assert(N > 0); 184 | a.width = N; 185 | a.height = N; 186 | b.width = N; 187 | b.height= N; 188 | 189 | suffix = std::string(argv[2]); 190 | } 191 | else if (argc == 4) { 192 | /* NxM, MxN */ 193 | const int N = atoi(argv[1]); 194 | const int M = atoi(argv[2]); 
195 | assert(N > 0 && M > 0); 196 | a.width = M; 197 | a.height = N; 198 | b.width = N; 199 | b.height = M; 200 | 201 | suffix = std::string(argv[3]); 202 | } 203 | else if (argc == 5) { 204 | /* NxM, MxK */ 205 | const int N = atoi(argv[1]); 206 | const int M = atoi(argv[2]); 207 | const int K = atoi(argv[3]); 208 | assert(N > 0 && M > 0); 209 | a.width = M; 210 | a.height = N; 211 | b.width = K; 212 | b.height = M; 213 | 214 | suffix = std::string(argv[4]); 215 | } 216 | else { 217 | std::cerr << "Invalid arguments!\n"; 218 | return 2; 219 | } 220 | 221 | 222 | a.rowSpan = RoundUpPwr2(a.width, FLT_ALIGN); 223 | b.rowSpan = RoundUpPwr2(b.width, FLT_ALIGN); 224 | 225 | a.mat = new float[a.rowSpan*a.height]; 226 | b.mat = new float[b.rowSpan*b.height]; 227 | 228 | RandInitMat(&a, matRand); 229 | RandInitMat(&b, matRand); 230 | 231 | printf("a: [%d %d] | b: [%d %d]\n", a.width, a.height, b.width, b.height); 232 | 233 | auto start = std::chrono::high_resolution_clock::now(); 234 | const Mat c = ST_TransposedBMatMul(a, b); 235 | auto end = std::chrono::high_resolution_clock::now(); 236 | std::cout << "Generation w/ tranposed mult. 
took: " 237 | << std::chrono::duration_cast(end - start).count() 238 | << " microseconds.\n"; 239 | 240 | DumpMat(("matrixA" + suffix + ".bin").c_str(), a); 241 | DumpMat(("matrixB" + suffix + ".bin").c_str(), b); 242 | DumpMat(("matrixAB" + suffix + ".bin").c_str(), c); 243 | 244 | delete[] a.mat; 245 | delete[] b.mat; 246 | _aligned_free(c.mat); 247 | 248 | return 0; 249 | } 250 | -------------------------------------------------------------------------------- /MatrixGenerator/MatrixGenerator.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | 15.0 26 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78} 27 | MatrixGenerator 28 | 10.0.16299.0 29 | 30 | 31 | 32 | Application 33 | true 34 | v141 35 | MultiByte 36 | 37 | 38 | Application 39 | false 40 | v141 41 | true 42 | MultiByte 43 | 44 | 45 | Application 46 | true 47 | v141 48 | MultiByte 49 | 50 | 51 | Application 52 | false 53 | v141 54 | true 55 | MultiByte 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | Level3 79 | Disabled 80 | true 81 | true 82 | 83 | 84 | Console 85 | 86 | 87 | 88 | 89 | Level3 90 | Disabled 91 | true 92 | true 93 | 94 | 95 | 96 | 97 | Level3 98 | MaxSpeed 99 | true 100 | true 101 | true 102 | true 103 | 104 | 105 | true 106 | true 107 | 108 | 109 | 110 | 111 | Level3 112 | MaxSpeed 113 | true 114 | true 115 | true 116 | true 117 | Speed 118 | AdvancedVectorExtensions2 119 | Fast 120 | false 121 | false 122 | true 123 | false 124 | false 125 | 126 | 127 | true 128 | true 129 | 130 | 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /MatrixGenerator/MatrixGenerator.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 
{4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 | 18 | 19 | Source Files 20 | 21 | 22 | -------------------------------------------------------------------------------- /MatrixMult.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.27428.2015 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixMult", "MatrixMult\MatrixMult.vcxproj", "{54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}" 7 | EndProject 8 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixMulTester", "MatrixMulTester\MatrixMulTester.vcxproj", "{0417B0D4-F0BF-4218-945C-C139C9498728}" 9 | EndProject 10 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MatrixGenerator", "MatrixGenerator\MatrixGenerator.vcxproj", "{C6A23610-8F92-418E-8BC6-2CEFA194CE78}" 11 | EndProject 12 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Benchmarks", "Benchmarks\Benchmarks.vcxproj", "{5895928A-FD77-4426-9588-36399A75D082}" 13 | EndProject 14 | Global 15 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 16 | Debug|x64 = Debug|x64 17 | Debug|x86 = Debug|x86 18 | Release|x64 = Release|x64 19 | Release|x86 = Release|x86 20 | EndGlobalSection 21 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 22 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x64.ActiveCfg = Debug|x64 23 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x64.Build.0 = Debug|x64 24 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x86.ActiveCfg = Debug|Win32 25 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Debug|x86.Build.0 = Debug|Win32 26 | 
{54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x64.ActiveCfg = Release|x64 27 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x64.Build.0 = Release|x64 28 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x86.ActiveCfg = Release|Win32 29 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7}.Release|x86.Build.0 = Release|Win32 30 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x64.ActiveCfg = Debug|x64 31 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x64.Build.0 = Debug|x64 32 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x86.ActiveCfg = Debug|Win32 33 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Debug|x86.Build.0 = Debug|Win32 34 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x64.ActiveCfg = Release|x64 35 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x64.Build.0 = Release|x64 36 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x86.ActiveCfg = Release|Win32 37 | {0417B0D4-F0BF-4218-945C-C139C9498728}.Release|x86.Build.0 = Release|Win32 38 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x64.ActiveCfg = Debug|x64 39 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x64.Build.0 = Debug|x64 40 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x86.ActiveCfg = Debug|Win32 41 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Debug|x86.Build.0 = Debug|Win32 42 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x64.ActiveCfg = Release|x64 43 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x64.Build.0 = Release|x64 44 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x86.ActiveCfg = Release|Win32 45 | {C6A23610-8F92-418E-8BC6-2CEFA194CE78}.Release|x86.Build.0 = Release|Win32 46 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x64.ActiveCfg = Debug|x64 47 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x64.Build.0 = Debug|x64 48 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x86.ActiveCfg = Debug|Win32 49 | {5895928A-FD77-4426-9588-36399A75D082}.Debug|x86.Build.0 = Debug|Win32 50 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x64.ActiveCfg = Release|x64 51 | 
{5895928A-FD77-4426-9588-36399A75D082}.Release|x64.Build.0 = Release|x64 52 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x86.ActiveCfg = Release|Win32 53 | {5895928A-FD77-4426-9588-36399A75D082}.Release|x86.Build.0 = Release|Win32 54 | EndGlobalSection 55 | GlobalSection(SolutionProperties) = preSolution 56 | HideSolutionNode = FALSE 57 | EndGlobalSection 58 | GlobalSection(ExtensibilityGlobals) = postSolution 59 | SolutionGuid = {D568E00C-A8ED-41CB-B719-B116D29D421F} 60 | EndGlobalSection 61 | GlobalSection(Performance) = preSolution 62 | HasPerformanceSessions = true 63 | EndGlobalSection 64 | EndGlobal 65 | -------------------------------------------------------------------------------- /MatrixMult/CPUUtil.cpp: -------------------------------------------------------------------------------- 1 | #include "CPUUtil.h" 2 | #include 3 | #include 4 | 5 | namespace CPUUtil 6 | { 7 | namespace 8 | { 9 | static int logicalProcInfoCached = 0; 10 | static unsigned numHWCores, numLogicalProcessors; 11 | static ULONG_PTR* physLogicalProcessorMap = NULL; 12 | 13 | void PrintSysLPInfoArr(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION* const sysLPInf, 14 | const DWORD& retLen) 15 | { 16 | unsigned numPhysicalCores = 0; 17 | for (int i = 0; i * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= retLen; 18 | ++i) { 19 | if (sysLPInf[i].Relationship != RelationProcessorCore) 20 | continue; 21 | 22 | printf( 23 | "PHYSICAL CPU[%d]\n\t_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX:\n", 24 | numPhysicalCores); 25 | printf("\t\tProcessorMask:%s\n", 26 | BitmaskToStr(sysLPInf[i].ProcessorMask)); 27 | printf("\t\tRelationship:%u | RelationProcessorCore\n", 28 | (uint8_t)sysLPInf[i].Relationship); 29 | printf("\t\tProcessorCore:\n"); 30 | printf("\t\t\tFlags(HT?):%d\n", 31 | (uint8_t)sysLPInf[i].ProcessorCore.Flags); 32 | ++numPhysicalCores; 33 | } 34 | } 35 | 36 | int TestPrintCPUCores() 37 | { 38 | const unsigned N = 30; 39 | _SYSTEM_LOGICAL_PROCESSOR_INFORMATION sysLPInf[N]; 40 | DWORD retLen = N 
* sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION); 41 | LOGICAL_PROCESSOR_RELATIONSHIP lpRel = RelationProcessorCore; 42 | 43 | BOOL retCode = GetLogicalProcessorInformation(&sysLPInf[0], &retLen); 44 | 45 | if (!retCode) { 46 | DWORD errCode = GetLastError(); 47 | printf("ERR: %d\n", errCode); 48 | if (errCode == ERROR_INSUFFICIENT_BUFFER) { 49 | printf("Buffer is not large enough! Buffer length required: %d\n", 50 | retLen); 51 | } else { 52 | printf("CHECK MSDN SYSTEM ERROR CODES LIST.\n"); 53 | } 54 | return errCode; 55 | } 56 | 57 | PrintSysLPInfoArr(sysLPInf, retLen); 58 | 59 | return 0; 60 | } 61 | 62 | template 63 | int NumSetBits(T n) { 64 | int count = 0; 65 | while (n) { 66 | count += (n & 1) > 0 ? 1 : 0; 67 | n >>= 1; 68 | } 69 | return count; 70 | } 71 | 72 | DWORD _GetSysLPMap(unsigned& numHWCores) 73 | { 74 | // These assumptions should never fail on desktop 75 | const unsigned N = 48, M = 48; 76 | 77 | _SYSTEM_LOGICAL_PROCESSOR_INFORMATION sysLPInf[N]; 78 | DWORD retLen = N * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION); 79 | LOGICAL_PROCESSOR_RELATIONSHIP lpRel = RelationProcessorCore; 80 | 81 | static BOOL retCode = GetLogicalProcessorInformation(&sysLPInf[0], &retLen); 82 | 83 | if (!retCode) { 84 | return GetLastError(); 85 | } 86 | 87 | ULONG_PTR* const lMap = (ULONG_PTR*)malloc(M * sizeof(ULONG_PTR)); 88 | 89 | numHWCores = 0; 90 | for (int i = 0; i * sizeof(_SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= retLen; 91 | ++i) { 92 | if (sysLPInf[i].Relationship != RelationProcessorCore) 93 | continue; 94 | 95 | ULONG_PTR logicalProcessorMask = sysLPInf[i].ProcessorMask; 96 | lMap[numHWCores++] = logicalProcessorMask; 97 | numLogicalProcessors += NumSetBits(logicalProcessorMask); 98 | } 99 | 100 | physLogicalProcessorMap = (ULONG_PTR*)malloc(numHWCores * sizeof(ULONG_PTR)); 101 | memcpy(physLogicalProcessorMap, lMap, numHWCores * sizeof(ULONG_PTR)); 102 | free(lMap); 103 | 104 | return 0; 105 | } 106 | } // private namespace 107 | 108 | const char* 
BitmaskToStr(WORD bitmask) 109 | { 110 | const unsigned N = sizeof(WORD) * 8; 111 | char* const str = new char[N + 1]; 112 | str[N] = 0; 113 | for (int i = 0; i < N; ++i) { 114 | str[N - i - 1] = '0' + ((bitmask)&1); 115 | bitmask >>= 1; 116 | } 117 | return str; 118 | } 119 | 120 | int GetNumHWCores() 121 | { 122 | if (!logicalProcInfoCached) { 123 | DWORD retCode = _GetSysLPMap(numHWCores); 124 | if (!retCode) 125 | logicalProcInfoCached = 1; 126 | else 127 | return -1; 128 | } 129 | return numHWCores; 130 | } 131 | 132 | int GetNumLogicalProcessors() { 133 | if (!logicalProcInfoCached) { 134 | DWORD retCode = _GetSysLPMap(numHWCores); 135 | if (!retCode) 136 | logicalProcInfoCached = 1; 137 | else 138 | return -1; 139 | } 140 | return numLogicalProcessors; 141 | } 142 | 143 | int GetProcessorMask(unsigned n, ULONG_PTR& mask) 144 | { 145 | if (!logicalProcInfoCached) { 146 | DWORD retCode = _GetSysLPMap(numHWCores); 147 | if (!retCode) 148 | logicalProcInfoCached = 1; 149 | else 150 | return retCode; 151 | } 152 | 153 | if (n >= numHWCores) 154 | return -1; 155 | 156 | mask = physLogicalProcessorMap[n]; 157 | 158 | return 0; 159 | } 160 | 161 | /* Returns decimal value for a 32 bit mask at compile time, [i:j] set to 1, rest are 0. */ 162 | constexpr int GenerateMask(int i, int j) 163 | { 164 | if (i > j) 165 | return (1 << (i + 1)) - (1 << j); 166 | else 167 | return (1 << (j + 1)) - (1 << i); 168 | } 169 | 170 | void GetCacheInfo(int* dCaches, int& iCache) 171 | { 172 | /* 173 | * From Intel's Processor Identification CPUID Instruction Notes: 174 | * EAX := 0x04, ECX := (0, 1, 2 .. until EAX[4:0]==0) 175 | * cpuid(memaddr, n, k) sets eax to n, ecx to k, 176 | * writes EAX, EBX, ECX, and EDX to memaddr[0:4] respectively. 
177 | * Cache size in bytes = (Ways + 1) * (Partitions + 1) 178 | * * (Line size + 1) * (Sets + 1) 179 | * = (EBX[31:22]+1) * (EBX[21:12]+1) 180 | * * (EBX[11:0]+1) * (ECX+1) 181 | * For now, this function assumes we're on a modern Intel CPU 182 | * So we have L1,2,3 data caches and first level instruction cache 183 | */ 184 | 185 | int cpui[4]; 186 | 187 | for (int i = 0, dc = 0; i < 4; ++i) { 188 | __cpuidex(cpui, 4, i); 189 | int sz = (((cpui[1] & GenerateMask(31, 22)) >> 22) + 1) * 190 | (((cpui[1] & GenerateMask(21, 12)) >> 12) + 1) * 191 | ((cpui[1] & GenerateMask(11, 0)) + 1) * (cpui[2] + 1); 192 | int cacheType = (cpui[0] & 31); 193 | if (cacheType == 1 || cacheType == 3) { 194 | dCaches[dc++] = sz; 195 | } else if (cacheType == 2) { 196 | iCache = sz; 197 | } 198 | } 199 | } 200 | 201 | int GetCacheLineSize() 202 | { 203 | /* 204 | * From Intel's Processor Identification CPUID Instruction Notes: 205 | * Executing CPUID with EAX=1, fills EAX, EBX, ECX, EDX 206 | * EBX[15:8] : CLFLUSHSIZE, val*8 = cache line size 207 | */ 208 | int cpui[4]; 209 | __cpuid(cpui, 1); 210 | return (cpui[1] & GenerateMask(15, 8)) >> (8 - 3); 211 | } 212 | 213 | int GetHTTStatus() { 214 | int cpui[4]; 215 | __cpuid(cpui, 1); 216 | return ((cpui[3] & (1<<28)) >> 28) ? 
1 : 0; 217 | } 218 | 219 | int GetSIMDSupport() { 220 | int cpui[4]; 221 | __cpuid(cpui, 1); 222 | int fma = (cpui[2] & (1 << 12)) >> 12; 223 | int avx = (cpui[2] & (1 << 28)) >> 28; 224 | return fma & avx; 225 | } 226 | 227 | }; // namespace CPUUtil 228 | -------------------------------------------------------------------------------- /MatrixMult/CPUUtil.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #define WIN32_LEAN_AND_MEAN 3 | #include 4 | #include 5 | #include 6 | 7 | namespace CPUUtil 8 | { 9 | /* Utility, convert given bitmask to const char* */ 10 | const char* BitmaskToStr(WORD bitmask); 11 | 12 | /* Get number of physical processors on the runtime system */ 13 | int GetNumHWCores(); 14 | 15 | /* Get number of logical processors on the runtime system */ 16 | int GetNumLogicalProcessors(); 17 | 18 | /* Get the logical processor mask corresponding to the Nth hardware core */ 19 | int GetProcessorMask(unsigned n, ULONG_PTR& mask); 20 | 21 | /* Fill dCaches with L1,2,3 data cache sizes, 22 | * and iCache with L1 dedicated instruction cache size. */ 23 | void GetCacheInfo(int* dCaches, int& iCache); 24 | 25 | /* Query cache line size on the current system. */ 26 | int GetCacheLineSize(); 27 | 28 | /* Query whether or not the runtime system supports HTT */ 29 | int GetHTTStatus(); 30 | 31 | /* Query if the runtime system supports AVX and FMA instruction sets. 
*/ 32 | int GetSIMDSupport(); 33 | 34 | }; // namespace CPUUtil 35 | -------------------------------------------------------------------------------- /MatrixMult/MatrixMul.cpp: -------------------------------------------------------------------------------- 1 | #define WIN32_LEAN_AND_MEAN 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include "ThreadPool.h" 19 | 20 | /* Define for AVX alignment requirements */ 21 | #define AVX_ALIGN 32 22 | 23 | /* Define CPU related variables, actual values will be queried on runtime. */ 24 | int CPUInfoQueried = 0; 25 | int L2Size = 256 * 1024; 26 | int L3Size = 12 * 1024 * 1024; 27 | int cacheLineSz = 64; 28 | int numHWCores = 6; 29 | 30 | /* Prefetching switches, if multiple MatMul operations are intended to run in parallel, 31 | * individual mutexes should be created for each one. */ 32 | constexpr int doL3Prefetch = 0; 33 | constexpr int doL12Prefetch = 0; 34 | int prefetched[1024][1024]; 35 | std::mutex prefetchMutex; 36 | 37 | /* Matrix structure */ 38 | typedef struct Mat { 39 | unsigned width; 40 | unsigned height; 41 | unsigned rowSpan; 42 | /* guarantee that mat will not be aliased (__restrict), 43 | no need for two matrices to point at sama data */ 44 | float* __restrict mat; 45 | } Mat; 46 | 47 | /* 48 | * This struct holds the information for multiple levels of block sizes. 
49 | * It's used to keep function parameters short and readable 50 | * Constraints on block sizes: 51 | * L2BlockX % 3 == L2BlockY % 4 == 0, 52 | * L3BlockX % 2 == L3BlockY % 2 == 0, 53 | * (L3BlockX / 2) % L2BlockX == 0 54 | */ 55 | typedef struct MMBlockInfo { 56 | const unsigned L3BlockX, L3BlockY; 57 | const unsigned L2BlockX, L2BlockY; 58 | const unsigned issuedBlockSzX, issuedBlockSzY; 59 | } MMBlockInfo; 60 | 61 | /* Load a previously saved matrix from disk */ 62 | const Mat LoadMat(const char* const filename) 63 | { 64 | Mat mat; 65 | uint32_t matSize; 66 | 67 | std::ifstream in(filename, std::ios::binary | std::ios::in); 68 | 69 | if (!in.is_open()) { 70 | std::cout << "Err loading!\n"; 71 | in.close(); 72 | return {0, 0, 0, NULL}; 73 | } 74 | 75 | in.read((char*)&mat, 3 * sizeof(uint32_t)); 76 | in.read((char*)&matSize, sizeof(uint32_t)); 77 | in.seekg(12 * sizeof(uint32_t), std::ios::cur); 78 | mat.mat = (float*)_aligned_malloc(matSize, AVX_ALIGN); 79 | in.read((char*)mat.mat, matSize); 80 | 81 | in.close(); 82 | 83 | return mat; 84 | } 85 | 86 | /* Dump the given matrix to the disk. 
*/ 87 | static void DumpMat(const char* filename, const Mat& m) 88 | { 89 | uint32_t header[16]; 90 | std::ofstream out(filename, std::ofstream::binary | std::ofstream::out); 91 | 92 | header[0] = m.width; 93 | header[1] = m.height; 94 | header[2] = m.rowSpan; 95 | header[3] = m.height * m.rowSpan * sizeof(float); 96 | 97 | out.write(reinterpret_cast(header), sizeof(header)); 98 | out.write(reinterpret_cast(m.mat), header[3]); 99 | 100 | out.close(); 101 | } 102 | 103 | /* Deallocate matrix data */ 104 | void FreeMat(Mat& mat) 105 | { 106 | if (!mat.mat) 107 | return; 108 | _aligned_free(mat.mat); 109 | mat.mat = NULL; 110 | } 111 | void FreeMat(const Mat& mat) 112 | { 113 | if (!mat.mat) 114 | return; 115 | _aligned_free(mat.mat); 116 | } 117 | 118 | /* Round a given number to the nearest multiple of K, 119 | * where K is a parameter and is a power of 2 */ 120 | static unsigned RoundUpPwr2(unsigned val, unsigned pwr2) 121 | { 122 | return (val + (pwr2 - 1)) & (~(pwr2 - 1)); 123 | } 124 | 125 | /* Compute the transpose of a given matrix. 126 | * A singlethreaded implementation without block tiling. 
*/ 127 | __declspec(noalias) const Mat TransposeMat(const Mat& mat) 128 | { 129 | const unsigned tRowSpan = RoundUpPwr2(mat.height, 64 / sizeof(float)); 130 | float* __restrict const tData = 131 | (float*)_aligned_malloc(mat.width * tRowSpan * sizeof(float), AVX_ALIGN); 132 | 133 | Mat T{mat.height, mat.width, tRowSpan, tData}; 134 | 135 | // the loops are truly interchangable as we encounter a cache miss either ways 136 | for (int rowT = 0; rowT < T.height; ++rowT) { 137 | for (int colT = 0; colT < T.width; ++colT) { 138 | tData[rowT * tRowSpan + colT] = mat.mat[colT * mat.rowSpan + rowT]; 139 | } 140 | } 141 | 142 | return T; 143 | } 144 | 145 | /* Print the given matrix to given std::ostream */ 146 | static void PrintMat(const Mat& mat, std::ostream& stream) 147 | { 148 | stream << "w, h, rS: " << mat.width << " " << mat.height << " " << mat.rowSpan 149 | << "\n"; 150 | for (int i = 0; i < mat.height; i++) { 151 | for (int j = 0; j < mat.width; ++j) { 152 | stream << mat.mat[i * mat.rowSpan + j] << " "; 153 | } 154 | stream << "\n"; 155 | } 156 | } 157 | 158 | /**************** Naive, initial implementations ****************/ 159 | 160 | /* Naive MatMul */ 161 | const Mat ST_NaiveMatMul(const Mat& matA, const Mat& matB) 162 | { 163 | /* First : naive solution with but with some tricks to make compiler (MSVC) behave 164 | * Note that, in this case, manually unrolling the loop helps 165 | * as the compiler can't auto-vectorize non-contagious memory access */ 166 | float* __restrict const matData = 167 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN); 168 | 169 | Mat matC{matB.width, matA.height, matB.rowSpan, matData}; 170 | 171 | for (int rowC = 0; rowC < matA.height; ++rowC) { 172 | for (int colC = 0; colC < matB.width; ++colC) { 173 | /* an independent, local accumulator. 
*/ 174 | float accumulate = 0; 175 | int pos = 0; 176 | /* manual unrolling IS helpful in this case */ 177 | for (; pos < matA.width - 4; pos += 4) { 178 | accumulate += matA.mat[rowC * matA.rowSpan + pos] * 179 | matB.mat[pos * matB.rowSpan + colC] + 180 | matA.mat[rowC * matA.rowSpan + pos + 1] * 181 | matB.mat[(pos + 1) * matB.rowSpan + colC] + 182 | matA.mat[rowC * matA.rowSpan + pos + 2] * 183 | matB.mat[(pos + 2) * matB.rowSpan + colC] + 184 | matA.mat[rowC * matA.rowSpan + pos + 3] * 185 | matB.mat[(pos + 3) * matB.rowSpan + colC]; 186 | } 187 | for (; pos < matA.width; ++pos) { 188 | accumulate += matA.mat[rowC * matA.rowSpan + pos] * 189 | matB.mat[pos * matB.rowSpan + colC]; 190 | } 191 | matData[rowC * matB.rowSpan + colC] = accumulate; 192 | } 193 | } 194 | 195 | return matC; 196 | } 197 | 198 | /* MatMul with transposed B for improved cache behavior. */ 199 | const Mat ST_TransposedBMatMul(const Mat& matA, const Mat& matB) 200 | { 201 | /* 202 | * Now, transposing B and then traversing it row order seemed promising! 203 | * Also, note that, if we manually unrolled the loop here, 204 | * compiler wouldn't vectorize the loop, 205 | * so we keep it simple and let MSVC auto vectorize this. 206 | */ 207 | float* __restrict const matData = 208 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN); 209 | 210 | Mat matC{matB.width, matA.height, matB.rowSpan, matData}; 211 | 212 | const Mat matBT = TransposeMat(matB); 213 | for (int rowC = 0; rowC < matA.height; ++rowC) { 214 | for (int colC = 0; colC < matB.width; ++colC) { 215 | float accumulate = 0; 216 | for (int pos = 0; pos < matA.width; ++pos) { 217 | accumulate += matA.mat[rowC * matA.rowSpan + pos] * 218 | matBT.mat[colC * matBT.rowSpan + pos]; 219 | } 220 | matData[rowC * matB.rowSpan + colC] = accumulate; 221 | } 222 | } 223 | 224 | _aligned_free(matBT.mat); 225 | 226 | return matC; 227 | } 228 | 229 | /* 230 | * MatMul with a different traversal order. 
231 | * Instead of linearly running thru whole rows of output matrix C, 232 | * calculate blocks of a certain size at a time. 233 | */ 234 | const Mat ST_BlockMult(const Mat& matA, const Mat& matB) 235 | { 236 | /* Now, once we fetch column col from B, we use these cached values 237 | * to populate C(row, col:col+8), Any more than that, 238 | * and we lose the old cached values. But notice that, 239 | * the C(row+1, col:col+8) uses the exact same columns. 240 | * So instead of traversing in row order, we could do blocks! 241 | * Notice that I'm using transposed B, 242 | * That's because MSVC refuses to vectorize the loop with 243 | * non-contagious memory access. 244 | * So even though the floats themselves will be in the cache, 245 | * we won't have SIMD, which kills the performance. 246 | * 247 | * Also, I had to assign offsets to temporary constants, 248 | * because otherwise MSVC can't auto-vectorize. */ 249 | float* __restrict const matData = 250 | (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN); 251 | 252 | Mat matC{matB.width, matA.height, matB.rowSpan, matData}; 253 | 254 | const unsigned blockX = 16, blockY = 16; 255 | 256 | const Mat matBT = TransposeMat(matB); 257 | 258 | int rowC = 0; 259 | for (; rowC < matA.height - blockY; rowC += blockY) { 260 | int colC = 0; 261 | for (; colC < matB.width - blockX; colC += blockX) { 262 | for (int blockRow = 0; blockRow < blockY; ++blockRow) { 263 | for (int blockCol = 0; blockCol < blockX; ++blockCol) { 264 | const unsigned r = rowC + blockRow; 265 | const unsigned c = colC + blockCol; 266 | const unsigned matAoffset = r * matA.rowSpan; 267 | const unsigned matBoffset = c * matBT.rowSpan; 268 | 269 | float accumulate = 0; 270 | for (int pos = 0; pos < matA.width; ++pos) { 271 | accumulate += 272 | matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos]; 273 | } 274 | matData[r * matB.rowSpan + c] = accumulate; 275 | } 276 | } 277 | } 278 | for (int blockRow = 0; blockRow < blockY; 
++blockRow) { 279 | for (int c = colC; c < matB.width; ++c) { 280 | const unsigned r = rowC + blockRow; 281 | const unsigned matAoffset = r * matA.rowSpan; 282 | const unsigned matBoffset = c * matBT.rowSpan; 283 | float accumulate = 0; 284 | for (int pos = 0; pos < matA.width; ++pos) { 285 | accumulate += 286 | matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos]; 287 | } 288 | matData[r * matB.rowSpan + c] = accumulate; 289 | } 290 | } 291 | } 292 | for (; rowC < matA.height; ++rowC) { 293 | for (int colC = 0; colC < matB.width; ++colC) { 294 | const unsigned matAoffset = rowC * matA.rowSpan; 295 | const unsigned matBoffset = colC * matBT.rowSpan; 296 | float accumulate = 0; 297 | for (int pos = 0; pos < matA.width; ++pos) { 298 | accumulate += matA.mat[matAoffset + pos] * matBT.mat[matBoffset + pos]; 299 | } 300 | matData[rowC * matB.rowSpan + colC] = accumulate; 301 | } 302 | } 303 | 304 | _aligned_free(matBT.mat); 305 | 306 | return matC; 307 | } 308 | 309 | /************** ~~Naive, initial implementations~~ **************/ 310 | 311 | /* Declerations for helper functions for the final implementation */ 312 | 313 | __declspec(noalias) void MMHelper_MultAnyBlocks(float* __restrict const matData, 314 | const unsigned rowSpan, const Mat& matA, 315 | const Mat& matBT, const unsigned colC, 316 | const unsigned rowC, const int blockX, 317 | const int blockY, 318 | const MMBlockInfo& mmBlockInfo); 319 | 320 | __declspec(noalias) void MMHelper_MultL2Blocks(float* __restrict const matData, 321 | const unsigned rowSpan, const Mat& matA, 322 | const Mat& matBT, const unsigned col, 323 | const unsigned row, 324 | const unsigned L2BlockX, 325 | const unsigned L2BlockY); 326 | 327 | __declspec(noalias) void MMHelper_MultFullBlocks(float* __restrict const matData, 328 | const unsigned rowSpan, 329 | const Mat& matA, const Mat& matBT, 330 | const unsigned colC, 331 | const unsigned rowC, 332 | const MMBlockInfo& mmBlockInfo); 333 | 334 | /* Declarations for helper 
functions that handle NxM blocks */ 335 | 336 | __declspec(noalias) void MMHelper_Mult4x3Blocks(float* __restrict const matData, 337 | const unsigned rowSpan, const Mat& matA, 338 | const Mat& matBT, const unsigned col, 339 | const unsigned row); 340 | __declspec(noalias) void MMHelper_Mult4x1Blocks(float* __restrict const matData, 341 | const unsigned rowSpan, const Mat& matA, 342 | const Mat& matBT, const unsigned col, 343 | const unsigned row); 344 | __declspec(noalias) void MMHelper_Mult1x3Blocks(float* __restrict const matData, 345 | const unsigned rowSpan, const Mat& matA, 346 | const Mat& matBT, const unsigned col, 347 | const unsigned row); 348 | __declspec(noalias) void MMHelper_Mult1x1Blocks(float* __restrict const matData, 349 | const unsigned rowSpan, const Mat& matA, 350 | const Mat& matBT, const unsigned col, 351 | const unsigned row); 352 | 353 | /* 354 | * Helper function for computing a block out of the output matrix C. 355 | * This function is used for the residues at the edges 356 | * after the majority of the matrix is computed as KxK sized blocks. 357 | * (t,l,b,r)->(row, col, row+blockY, col+blockX). 
358 | */ 359 | __declspec(noalias) void MMHelper_MultAnyBlocks(float* __restrict const matData, 360 | const unsigned rowSpan, const Mat& matA, 361 | const Mat& matBT, const unsigned colC, 362 | const unsigned rowC, const int blockX, 363 | const int blockY, 364 | const MMBlockInfo& mmBlockInfo) 365 | { 366 | /* if no work to be done, exit */ 367 | if (blockX <= 0 || blockY <= 0) 368 | return; 369 | 370 | /* shorthand for some parameters */ 371 | const unsigned L2BlockX = mmBlockInfo.L2BlockX, L2BlockY = mmBlockInfo.L2BlockY, 372 | L3BlockX = mmBlockInfo.L3BlockX, L3BlockY = mmBlockInfo.L3BlockY; 373 | 374 | int blockRowC = rowC; 375 | /* handle full L2Y sized rows */ 376 | for (; blockRowC <= rowC + blockY - L2BlockY; blockRowC += L2BlockY) { 377 | int blockColC = colC; 378 | /* handle (L2X x L2Y) blocks */ 379 | for (; blockColC <= colC + blockX - L2BlockX; blockColC += L2BlockX) { 380 | MMHelper_MultL2Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC, 381 | L2BlockX, L2BlockY); 382 | } 383 | /* handle the remaining columns, (w 4) { 387 | for (; blockCol <= colC + blockX - 3; blockCol += 3) { 388 | MMHelper_Mult4x3Blocks(matData, rowSpan, matA, matBT, blockCol, 389 | blockRow); 390 | } 391 | } 392 | for (; blockCol < colC + blockX; ++blockCol) { 393 | MMHelper_Mult4x1Blocks(matData, rowSpan, matA, matBT, blockCol, 394 | blockRow); 395 | } 396 | } 397 | } 398 | /* handle rest of the rows, h 450 | * [---- [a1] [a2] ---- ] 451 | * [---- [b1] [b2] ---- ] 452 | */ 453 | 454 | for (int pos = 0; pos < matA.width; pos += 16) { 455 | a1 = _mm256_load_ps(&matA.mat[matAoffset + pos]); 456 | a2 = _mm256_load_ps(&matA.mat[matAoffset + pos + 8]); 457 | 458 | b1 = _mm256_load_ps(&matBT.mat[matBToffset + pos]); 459 | b2 = _mm256_load_ps(&matBT.mat[matBToffset + pos + 8]); 460 | 461 | c1 = _mm256_fmadd_ps(a1, b1, c1); 462 | c2 = _mm256_fmadd_ps(a2, b2, c2); 463 | } 464 | 465 | c1 = _mm256_add_ps(c1, c2); 466 | _mm256_store_ps(&fps[0], c1); 467 | 468 | accumulate = 0; 469 | 
for (int i = 0; i < 8; ++i) { 470 | accumulate += fps[i]; 471 | } 472 | 473 | /* store */ 474 | matData[row * rowSpan + col] = accumulate; 475 | } 476 | 477 | /* Calculates a 1x3 block on the matrix C, (t,l,b,r)->(row,col,row+1,col+3) */ 478 | __declspec(noalias) void MMHelper_Mult1x3Blocks(float* __restrict const matData, 479 | const unsigned rowSpan, const Mat& matA, 480 | const Mat& matBT, const unsigned col, 481 | const unsigned row) 482 | { 483 | /* set up scalar array and accumulators for doing the horizontal sum (__m256 -> f32) 484 | * and storing its value. Horizontal sum is auto-vectorized by the compiler anyways. */ 485 | __declspec(align(32)) float fps[8 * 3]; 486 | __declspec(align(32)) float accumulate[3]; 487 | 488 | /* we will be reusing these */ 489 | const unsigned matAoffset = row * matA.rowSpan; 490 | const unsigned matBToffset1 = (col + 0) * matBT.rowSpan, 491 | matBToffset2 = (col + 1) * matBT.rowSpan, 492 | matBToffset3 = (col + 2) * matBT.rowSpan; 493 | 494 | /* set up accumulators */ 495 | __m256 a1, b1, b2, b3; 496 | __m256 c1 = _mm256_setzero_ps(); 497 | __m256 c2 = _mm256_setzero_ps(); 498 | __m256 c3 = _mm256_setzero_ps(); 499 | 500 | for (int pos = 0; pos < matA.width; pos += 8) { 501 | a1 = _mm256_load_ps(&matA.mat[matAoffset + pos]); 502 | 503 | b1 = _mm256_load_ps(&matBT.mat[matBToffset1 + pos]); 504 | b2 = _mm256_load_ps(&matBT.mat[matBToffset2 + pos]); 505 | b3 = _mm256_load_ps(&matBT.mat[matBToffset3 + pos]); 506 | 507 | c1 = _mm256_fmadd_ps(a1, b1, c1); 508 | c2 = _mm256_fmadd_ps(a1, b2, c2); 509 | c3 = _mm256_fmadd_ps(a1, b3, c3); 510 | } 511 | 512 | /* horizontal sum */ 513 | 514 | memset(&accumulate[0], 0, 3 * sizeof(float)); 515 | 516 | _mm256_store_ps(&fps[0], c1); 517 | _mm256_store_ps(&fps[8], c2); 518 | _mm256_store_ps(&fps[16], c3); 519 | 520 | /* autovectorized */ 521 | for (int i = 0; i < 3; ++i) { 522 | for (int j = 0; j < 8; ++j) { 523 | accumulate[i] += fps[i * 8 + j]; 524 | } 525 | } 526 | 527 | /* stores */ 528 | 
matData[row * rowSpan + col + 0] = accumulate[0]; 529 | matData[row * rowSpan + col + 1] = accumulate[1]; 530 | matData[row * rowSpan + col + 2] = accumulate[2]; 531 | } 532 | 533 | /* Calculates a 4x1 block on output matrix C. (t,l,b,r)->(row,col,row+4,col+1) */ 534 | __declspec(noalias) void MMHelper_Mult4x1Blocks(float* __restrict const matData, 535 | const unsigned rowSpan, const Mat& matA, 536 | const Mat& matBT, const unsigned col, 537 | const unsigned row) 538 | { 539 | /* set up scalar array and accumulators for doing the horizontal sum (__m256 -> f32) 540 | * and storing its value. Horizontal sum is auto-vectorized by the compiler anyways. */ 541 | __declspec(align(32)) float fps[8 * 12]; 542 | __declspec(align(32)) float accumulate[8 * 12]; 543 | 544 | const unsigned matAoffset1 = (row + 0) * matA.rowSpan, 545 | matAoffset2 = (row + 1) * matA.rowSpan, 546 | matAoffset3 = (row + 2) * matA.rowSpan, 547 | matAoffset4 = (row + 3) * matA.rowSpan; 548 | 549 | const unsigned matBToffset = col * matBT.rowSpan; 550 | 551 | /* set up accumulators */ 552 | __m256 a11, a12, a21, a22, a31, a32, a41, a42, b1, b2; 553 | __m256 c1 = _mm256_setzero_ps(); 554 | __m256 c2 = _mm256_setzero_ps(); 555 | __m256 c3 = _mm256_setzero_ps(); 556 | __m256 c4 = _mm256_setzero_ps(); 557 | __m256 c5 = _mm256_setzero_ps(); 558 | __m256 c6 = _mm256_setzero_ps(); 559 | __m256 c7 = _mm256_setzero_ps(); 560 | __m256 c8 = _mm256_setzero_ps(); 561 | 562 | for (int pos = 0; pos < matA.width; pos += 16) { 563 | a11 = _mm256_load_ps(&matA.mat[matAoffset1 + pos]); 564 | a12 = _mm256_load_ps(&matA.mat[matAoffset1 + pos + 8]); 565 | 566 | a21 = _mm256_load_ps(&matA.mat[matAoffset2 + pos]); 567 | a22 = _mm256_load_ps(&matA.mat[matAoffset2 + pos + 8]); 568 | 569 | a31 = _mm256_load_ps(&matA.mat[matAoffset3 + pos]); 570 | a32 = _mm256_load_ps(&matA.mat[matAoffset3 + pos + 8]); 571 | 572 | a41 = _mm256_load_ps(&matA.mat[matAoffset4 + pos]); 573 | a42 = _mm256_load_ps(&matA.mat[matAoffset4 + pos + 8]); 
574 | 575 | b1 = _mm256_load_ps(&matBT.mat[matBToffset + pos]); 576 | b2 = _mm256_load_ps(&matBT.mat[matBToffset + pos + 8]); 577 | 578 | c1 = _mm256_fmadd_ps(a11, b1, c1); 579 | c2 = _mm256_fmadd_ps(a21, b1, c2); 580 | c3 = _mm256_fmadd_ps(a31, b1, c3); 581 | c4 = _mm256_fmadd_ps(a41, b1, c4); 582 | 583 | c5 = _mm256_fmadd_ps(a12, b2, c5); 584 | c6 = _mm256_fmadd_ps(a22, b2, c6); 585 | c7 = _mm256_fmadd_ps(a32, b2, c7); 586 | c8 = _mm256_fmadd_ps(a42, b2, c8); 587 | } 588 | 589 | /* horizontal sum */ 590 | 591 | memset(&accumulate[0], 0, 4 * sizeof(float)); 592 | 593 | c1 = _mm256_add_ps(c1, c5); 594 | c2 = _mm256_add_ps(c2, c6); 595 | c3 = _mm256_add_ps(c3, c7); 596 | c4 = _mm256_add_ps(c4, c8); 597 | 598 | _mm256_store_ps(&fps[0], c1); 599 | _mm256_store_ps(&fps[8], c2); 600 | _mm256_store_ps(&fps[16], c3); 601 | _mm256_store_ps(&fps[24], c4); 602 | 603 | /* autovectorized */ 604 | for (int i = 0; i < 4; ++i) { 605 | for (int j = 0; j < 8; ++j) { 606 | accumulate[i] += fps[i * 8 + j]; 607 | } 608 | } 609 | 610 | /* stores */ 611 | matData[(row + 0) * rowSpan + col] = accumulate[0]; 612 | matData[(row + 1) * rowSpan + col] = accumulate[1]; 613 | matData[(row + 2) * rowSpan + col] = accumulate[2]; 614 | matData[(row + 3) * rowSpan + col] = accumulate[3]; 615 | } 616 | 617 | /* Calculates a 4x3 block on output matrix C. 
   (t,l,b,r)->(row,col,row+4,col+3) */
__declspec(noalias) void MMHelper_Mult4x3Blocks(float* __restrict const matData,
                                                const unsigned rowSpan, const Mat& matA,
                                                const Mat& matBT, const unsigned col,
                                                const unsigned row)
{
    /* aligned scalar placeholders and accumulators for the horizontal sums */
    __declspec(align(32)) float fps[8 * 12];
    __declspec(align(32)) float accumulate[12];

    /* hoisted start offsets for the 4 rows of A and 3 rows of BT
     * (columns of B) this kernel touches */
    const unsigned matAoffset1 = (row + 0) * matA.rowSpan,
                   matAoffset2 = (row + 1) * matA.rowSpan,
                   matAoffset3 = (row + 2) * matA.rowSpan,
                   matAoffset4 = (row + 3) * matA.rowSpan,
                   matBToffset1 = (col + 0) * matBT.rowSpan,
                   matBToffset2 = (col + 1) * matBT.rowSpan,
                   matBToffset3 = (col + 2) * matBT.rowSpan;

    /*
     * <-----A.w----> <-----A.w---->
     * [----[a1]----] [----[b1]----]
     * [----[a2]----] [----[b2]----]
     * [----[a3]----] [----[b3]----]
     * [----[a4]----] ^col
     * ^ row
     *
     * We compute the dot products of 4 rows of A against 3 columns of B
     * at the same time, 1x8f vectors at a time.
     *
     * 3 ymm registers for b1:3,
     * 4*3 = 12 registers for the accumulators,
     * 1 register for the temporary ai value loaded.
     * All 16 registers are used.
     * High arithmetic density: 7 loads -> 12 fma instructions
     *
     */

    /* set up SIMD variables */
    __m256 a, b1, b2, b3;
    __m256 c1 = _mm256_setzero_ps();
    __m256 c2 = _mm256_setzero_ps();
    __m256 c3 = _mm256_setzero_ps();
    __m256 c4 = _mm256_setzero_ps();
    __m256 c5 = _mm256_setzero_ps();
    __m256 c6 = _mm256_setzero_ps();
    __m256 c7 = _mm256_setzero_ps();
    __m256 c8 = _mm256_setzero_ps();
    __m256 c9 = _mm256_setzero_ps();
    __m256 c10 = _mm256_setzero_ps();
    __m256 c11 = _mm256_setzero_ps();
    __m256 c12 = _mm256_setzero_ps();

    /* if prefetch switch is set,
     * prefetch first sections, one cache line at a time */
    if constexpr (doL12Prefetch) {
        _mm_prefetch((const char*)&matA.mat[matAoffset1], _MM_HINT_T0);
        _mm_prefetch((const char*)&matA.mat[matAoffset2], _MM_HINT_T0);
        _mm_prefetch((const char*)&matA.mat[matAoffset3], _MM_HINT_T0);
        _mm_prefetch((const char*)&matA.mat[matAoffset4], _MM_HINT_T0);

        _mm_prefetch((const char*)&matBT.mat[matBToffset1], _MM_HINT_T0);
        _mm_prefetch((const char*)&matBT.mat[matBToffset2], _MM_HINT_T0);
        _mm_prefetch((const char*)&matBT.mat[matBToffset3], _MM_HINT_T0);
    }

    /* do the dot products; the prefetches are deliberately interleaved
     * between the FMA groups so they overlap with computation.
     * (pos & 15) is nonzero on every other iteration (pos advances by 8,
     * one cache line holds 16 floats) — prefetch a line only once. */
    for (int pos = 0; pos < matA.width; pos += 8) {
        if constexpr (doL12Prefetch) {
            if ((pos & (unsigned)15)) {
                _mm_prefetch((const char*)&matA.mat[matAoffset1 + pos + 8],
                             _MM_HINT_T0);
            }
        }

        b1 = _mm256_load_ps(&matBT.mat[matBToffset1 + pos]);
        b2 = _mm256_load_ps(&matBT.mat[matBToffset2 + pos]);
        b3 = _mm256_load_ps(&matBT.mat[matBToffset3 + pos]);

        if constexpr (doL12Prefetch) {
            if ((pos & (unsigned)15)) {
                _mm_prefetch((const char*)&matA.mat[matAoffset2 + pos + 8],
                             _MM_HINT_T0);
            }
        }

        /* row 1 of A against the 3 columns */
        a = _mm256_load_ps(&matA.mat[matAoffset1 + pos]);
        c1 = _mm256_fmadd_ps(a, b1, c1);
        c2 = _mm256_fmadd_ps(a, b2, c2);
        c3 = _mm256_fmadd_ps(a, b3, c3);

        if constexpr (doL12Prefetch) {
            if ((pos & (unsigned)15)) {
                _mm_prefetch((const char*)&matA.mat[matAoffset3 + pos + 8],
                             _MM_HINT_T0);
            }
        }
        /* row 2 */
        a = _mm256_load_ps(&matA.mat[matAoffset2 + pos]);
        c4 = _mm256_fmadd_ps(a, b1, c4);
        c5 = _mm256_fmadd_ps(a, b2, c5);
        c6 = _mm256_fmadd_ps(a, b3, c6);

        if constexpr (doL12Prefetch) {
            if ((pos & (unsigned)15)) {
                _mm_prefetch((const char*)&matA.mat[matAoffset4 + pos + 8],
                             _MM_HINT_T0);
            }
        }

        /* row 3 */
        a = _mm256_load_ps(&matA.mat[matAoffset3 + pos]);
        c7 = _mm256_fmadd_ps(a, b1, c7);
        c8 = _mm256_fmadd_ps(a, b2, c8);
        c9 = _mm256_fmadd_ps(a, b3, c9);

        if constexpr (doL12Prefetch) {
            if ((pos & (unsigned)15)) {
                _mm_prefetch((const char*)&matBT.mat[matBToffset1 + pos + 8],
                             _MM_HINT_T0);
                _mm_prefetch((const char*)&matBT.mat[matBToffset2 + pos + 8],
                             _MM_HINT_T0);
                _mm_prefetch((const char*)&matBT.mat[matBToffset3 + pos + 8],
                             _MM_HINT_T0);
            }
        }

        /* row 4 */
        a = _mm256_load_ps(&matA.mat[matAoffset4 + pos]);
        c10 = _mm256_fmadd_ps(a, b1, c10);
        c11 = _mm256_fmadd_ps(a, b2, c11);
        c12 = _mm256_fmadd_ps(a, b3, c12);
    }

    /* horizontal sum of the 12 accumulators */
    memset(&accumulate[0], 0, 12 * sizeof(float));

    _mm256_store_ps(&fps[0], c1);
    _mm256_store_ps(&fps[8], c2);
    _mm256_store_ps(&fps[16], c3);
    _mm256_store_ps(&fps[24], c4);
    _mm256_store_ps(&fps[32], c5);
    _mm256_store_ps(&fps[40], c6);
    _mm256_store_ps(&fps[48], c7);
    _mm256_store_ps(&fps[56], c8);
    _mm256_store_ps(&fps[64], c9);
    _mm256_store_ps(&fps[72], c10);
    _mm256_store_ps(&fps[80], c11);
    _mm256_store_ps(&fps[88], c12);

    /* autovectorized reduction */
    for (int i = 0; i < 12; ++i) {
        for (int j = 0; j < 8; ++j) {
            accumulate[i] += fps[i * 8 + j];
        }
    }

    /* stores: accumulate[] is laid out row-major, 3 per row */
    matData[(row + 0) * rowSpan + col + 0] = accumulate[0];
    matData[(row + 0) * rowSpan + col + 1] = accumulate[1];
    matData[(row + 0) * rowSpan + col + 2] = accumulate[2];

    matData[(row + 1) * rowSpan + col + 0] = accumulate[3];
    matData[(row + 1) * rowSpan + col + 1] = accumulate[4];
    matData[(row + 1) * rowSpan + col + 2] = accumulate[5];

    matData[(row + 2) * rowSpan + col + 0] = accumulate[6];
    matData[(row + 2) * rowSpan + col + 1] = accumulate[7];
    matData[(row + 2) * rowSpan + col + 2] = accumulate[8];

    matData[(row + 3) * rowSpan + col + 0] = accumulate[9];
    matData[(row + 3) * rowSpan + col + 1] = accumulate[10];
    matData[(row + 3) * rowSpan + col + 2] = accumulate[11];
}

/*
 * Compute L2Y x L2X sized blocks from the output matrix C.
 * In order to keep this code nice and hot in instruction cache,
 * keep it restricted to full blocks of L2X x L2Y.
 */
__declspec(noalias) void MMHelper_MultL2Blocks(float* __restrict const matData,
                                               const unsigned rowSpan, const Mat& matA,
                                               const Mat& matBT, const unsigned col,
                                               const unsigned row,
                                               const unsigned L2BlockX,
                                               const unsigned L2BlockY)
{
    /* multiply 4x3 blocks; requires L2BlockX == 3*k, L2BlockY == 4*m
     * (guaranteed by how MTMatMul derives the block sizes) */
    for (int blockRow = row; blockRow < row + L2BlockY; blockRow += 4) {
        for (int blockCol = col; blockCol < col + L2BlockX; blockCol += 3) {
            MMHelper_Mult4x3Blocks(matData, rowSpan, matA, matBT, blockCol, blockRow);
        }
    }
}

/* Compute K x K sized blocks from the output matrix C.
see struct mmBlockInfo */ 808 | __declspec(noalias) void MMHelper_MultFullBlocks(float* __restrict const matData, 809 | const unsigned rowSpan, 810 | const Mat& matA, const Mat& matBT, 811 | const unsigned colC, 812 | const unsigned rowC, 813 | const MMBlockInfo& mmBlockInfo) 814 | { 815 | const unsigned L2BlockX = mmBlockInfo.L2BlockX, L2BlockY = mmBlockInfo.L2BlockY, 816 | L3BlockX = mmBlockInfo.L3BlockX, L3BlockY = mmBlockInfo.L3BlockY, 817 | issuedBlockSzX = mmBlockInfo.issuedBlockSzX, 818 | issuedBlockSzY = mmBlockInfo.issuedBlockSzY; 819 | 820 | /* try to prefetch next bit of block into memory while still handling this one */ 821 | { 822 | if constexpr (doL3Prefetch) { 823 | std::unique_lock lock(prefetchMutex); 824 | int alreadyPrefetchedCol = 825 | prefetched[rowC / L3BlockY][colC / issuedBlockSzX]; 826 | lock.unlock(); 827 | if (!alreadyPrefetchedCol) { 828 | for (int c = colC + issuedBlockSzX; c < colC + issuedBlockSzX; ++c) { 829 | for (int pos = 0; pos < matA.rowSpan; 830 | pos += cacheLineSz / sizeof(float)) { 831 | _mm_prefetch((const char*)&matBT.mat[c * matBT.rowSpan + pos], 832 | _MM_HINT_T2); 833 | } 834 | } 835 | lock.lock(); 836 | prefetched[rowC / L3BlockY][colC / issuedBlockSzX]++; 837 | lock.unlock(); 838 | } 839 | } 840 | } 841 | 842 | /* multiply L2YxL2X blocks */ 843 | for (int blockColC = colC; blockColC < colC + issuedBlockSzX; 844 | blockColC += L2BlockX) { 845 | for (int blockRowC = rowC; blockRowC < rowC + issuedBlockSzY; 846 | blockRowC += L2BlockY) { 847 | MMHelper_MultL2Blocks(matData, rowSpan, matA, matBT, blockColC, blockRowC, 848 | L2BlockX, L2BlockY); 849 | } 850 | } 851 | } 852 | 853 | /* 854 | * This function divides the matrix multiplication into segments and 855 | * issues commands for a cache aware thread pool to handle them. 856 | * Uses the helper functions above. 
 */
__declspec(noalias) const Mat MTMatMul(const Mat& matA, const Mat& matB)
{
    /* if CPU information is not already queried, do so (cached in globals) */
    if (!CPUInfoQueried) {
        int dCaches[3];
        int iCache;

        CPUUtil::GetCacheInfo(&dCaches[0], iCache);

        L2Size = dCaches[1];
        L3Size = dCaches[2];

        cacheLineSz = CPUUtil::GetCacheLineSize();

        CPUInfoQueried++;
    }

    /* allocate the aligned float array for our new matrix C */
    float* __restrict const matData =
        (float*)_aligned_malloc(matA.height * matB.rowSpan * sizeof(float), AVX_ALIGN);

    /* construct matrix C */
    Mat matC{matB.width, matA.height, matB.rowSpan, matData};

    /* for the sake of cache, we'll be working with transposed B */
    const Mat matBT = TransposeMat(matB);

    /* initialize the HWLocalThreadPool with 1 or 2 threads per physical core
     * for all physical cores. Number of threads per core depends on HTT status.
     * jobStride is also the number of functions issued per job. */
    const int HTTEnabled = CPUUtil::GetHTTStatus();
    const int jobStride = (1 << HTTEnabled);
    HWLocalThreadPool tp(0, jobStride);

    /* decide the block sizes for the given matrix and CPU:
     * QL2/QL3 = how many N-float rows fit in the L2/L3 cache */
    const float invN = 1.0 / matA.rowSpan;

    int QL2 = invN * L2Size / sizeof(float);
    int QL3 = invN * L3Size / sizeof(float);
    /* L2 blocks must be multiples of the 4x3 kernel: X = 3k, Y = 4m */
    int k = min(max(QL2 / 6, 1), 10);
    int m = min(max(QL2 / 8, 1), 10);
    int L2BlockX = 3 * k;
    int L2BlockY = 4 * m;
    /* L3 block edge is kept a multiple of lcm(k, m) so it tiles evenly
     * with both L2 dimensions */
    int lcmMN = std::lcm(k, m);
    int L3BlockX = min(max(QL3 / 120 / lcmMN * lcmMN * 60, 12*L2BlockX), 360);
    int L3BlockY = L3BlockX;
    int issuedBlockSzX = L3BlockX / 4;
    int issuedBlockSzY = L3BlockY / 3;

    /*printf("%d %d\n%d %d %d %d %d %d\n", matC.height, matC.width, L2BlockX, L2BlockY, issuedBlockSzX, issuedBlockSzY,
    L3BlockX, L3BlockY);*/

    MMBlockInfo mmBlockInfo{L3BlockX, L3BlockY, L2BlockX,
                            L2BlockY, issuedBlockSzX, issuedBlockSzY};

    /* before we begin, start prefetching the first L3 level block */
    /* reset the prefetched flags (global 1024x1024 int table) */
    memset(&prefetched[0][0], 0, 1024 * 1024 * sizeof(int));
    /* prefetch rows of A and columns of B, one cache line at a time */
    for (int r = 0; r < L3BlockY; ++r) {
        for (int pos = 0; pos < matA.rowSpan; pos += cacheLineSz / sizeof(float)) {
            _mm_prefetch((const char*)&matA.mat[r * matA.rowSpan + pos], _MM_HINT_T2);
        }
    }
    for (int c = 0; c < L3BlockX; ++c) {
        /* NOTE(review): the inner bound uses matA.rowSpan for BT rows —
         * presumably matBT.rowSpan equals matA's padded width here; verify */
        for (int pos = 0; pos < matA.rowSpan; pos += cacheLineSz / sizeof(float)) {
            _mm_prefetch((const char*)&matBT.mat[c * matBT.rowSpan + pos], _MM_HINT_T2);
        }
    }
    /* prefetch is called for the first block, mark it. */
    prefetched[0][0]++;

    /* start issuing jobs for the thread pool */

    /*
     * We incorporate multiple levels of tiling into our traversal.
     *
     * If we issue commands linearly, we'll have poor L3 cache utilization:
     * [ [C0T0 | C0T1] [C1T0 | C1T1] ... [C5T0 | C5T1] ] covering a rows, b columns,
     * (a+b)N floats of data is needed to compute an a*b sized block.
     * So, instead, we issue commands in a blocked manner, like:
     * [ [C0T0 | C0T1] [C1T0 | C1T1]
     *   [C2T0 | C2T1] [C3T0 | C3T1] ]
     *
     * Traverse L3 sized blocks,
     * inside each, issue issuedBlockSz sized blocks.
     * Each job carries jobStride functions: when HTT is on the two hardware
     * threads of a core handle two adjacent issuedBlockSzX columns.
     */

    int rowC = 0;
    /* handle L3Y sized rows
     * cast unsigned dimensions to signed to avoid UB */
    for (; rowC <= (int)matA.height - L3BlockY; rowC += L3BlockY) {
        int colC = 0;
        /* handle L3Y x L3X sized blocks */
        for (; colC <= (int)matB.width - L3BlockX; colC += L3BlockX) {
            /* Issue issuedBlockSzY x issuedBlockSzX sized blocks */
            for (int blockRowC = rowC; blockRowC < rowC + L3BlockY;
                 blockRowC += issuedBlockSzY) {
                for (int blockColC = colC; blockColC < colC + L3BlockX;
                     blockColC += jobStride * issuedBlockSzX) {
                    tp.Add({
                        HWLocalThreadPool::WrapFunc(MMHelper_MultFullBlocks, matData,
                                                    matB.rowSpan, matA, matBT, blockColC,
                                                    blockRowC, mmBlockInfo),
                        HWLocalThreadPool::WrapFunc(MMHelper_MultFullBlocks, matData,
                                                    matB.rowSpan, matA, matBT,
                                                    blockColC + issuedBlockSzX,
                                                    blockRowC, mmBlockInfo)
                    });
                }
            }
        }
        /* handle the block w < L3X, h = L3Y at the end of the row,
         * splitting the remaining columns between the jobStride threads */
        if (matB.width > colC) {
            const unsigned remSubX = (matB.width - colC) >> HTTEnabled;
            tp.Add({
                HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
                                            matB.rowSpan, matA, matBT, colC, rowC,
                                            remSubX, L3BlockY, mmBlockInfo),
                HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
                                            matB.rowSpan, matA, matBT,
                                            colC + remSubX, rowC,
                                            matB.width - colC - remSubX, L3BlockY,
                                            mmBlockInfo)
            });
        }
    }
    /* handle last row, h < L3Y */
    int colC = 0;
    /* first handle blocks of w = L3X, h < L3Y */
    for (; colC <= (int)matB.width - L3BlockX; colC += jobStride * issuedBlockSzX) {
        tp.Add({
            HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
                                        matB.rowSpan, matA, matBT, colC,
                                        rowC, issuedBlockSzX, matA.height - rowC,
                                        mmBlockInfo),
            HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData,
                                        matB.rowSpan, matA, matBT,
                                        colC + issuedBlockSzX, rowC, issuedBlockSzX,
                                        matA.height - rowC, mmBlockInfo)});
    }
    /* now handle the rightmost block of w < L3X, h < L3Y;
     * second slot is a no-op lambda to keep the job shape uniform */
    tp.Add({HWLocalThreadPool::WrapFunc(MMHelper_MultAnyBlocks, matData, matB.rowSpan,
                                        matA, matBT, colC, rowC, matB.width - colC,
                                        matA.height - rowC, mmBlockInfo),
            []() {}});

    /* -- commands issued -- */

    /* wait for the thread pool to finish */
    tp.Close();
    /* free the temporary bT matrix */
    _aligned_free(matBT.mat);

    return matC;
}

/* MatMul function, a simple branch that calls the proper implementation
 * based on the complexity of the input matrix. */
const Mat MatMul(const Mat& matA, const Mat& matB)
{
    /*
     * If complexity is low enough,
     * use the single threaded, transposed B method.
1021 | * A(N, M) B(M, K) => # of ops ~= 2*N*K*M 1022 | */ 1023 | if (matA.height * matA.width * matB.width < 350 * 350 * 350) { 1024 | return ST_TransposedBMatMul(matA, matB); 1025 | } 1026 | return MTMatMul(matA, matB); 1027 | } 1028 | 1029 | int __cdecl main(int argc, char* argv[]) 1030 | { 1031 | if (argc < 4) { 1032 | std::cout << "No args\n"; 1033 | return 0; 1034 | } 1035 | 1036 | /* make sure the runtime system supports AVX and FMA ISAs */ 1037 | assert(CPUUtil::GetSIMDSupport()); 1038 | 1039 | const char* inputMtxAFile = argv[1]; 1040 | const char* inputMtxBFile = argv[2]; 1041 | const char* outMtxABFile = argv[3]; 1042 | 1043 | //const char* inputMtxAFile = "matrixAx.bin"; 1044 | //const char* inputMtxBFile = "matrixBx.bin"; 1045 | //const char* outMtxABFile = "matrixAB-out.bin"; 1046 | 1047 | const Mat inputMtxA = LoadMat(inputMtxAFile); 1048 | const Mat inputMtxB = LoadMat(inputMtxBFile); 1049 | 1050 | /*printf("%d %d %d %d\n", inputMtxA.height, inputMtxA.width, inputMtxB.height, 1051 | inputMtxB.width);*/ 1052 | 1053 | auto start = std::chrono::high_resolution_clock::now(); 1054 | const Mat outMtxAB = MatMul(inputMtxA, inputMtxB); 1055 | auto end = std::chrono::high_resolution_clock::now(); 1056 | 1057 | std::cout 1058 | << "Matrix Multiplication: " 1059 | << std::chrono::duration_cast(end - start).count() 1060 | << " microseconds.\n"; 1061 | 1062 | DumpMat(outMtxABFile, outMtxAB); 1063 | 1064 | FreeMat(inputMtxA); 1065 | FreeMat(inputMtxB); 1066 | FreeMat(outMtxAB); 1067 | 1068 | return 0; 1069 | } 1070 | -------------------------------------------------------------------------------- /MatrixMult/MatrixMult.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | Debug 14 | x64 15 | 16 | 17 | Release 18 | x64 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 15.0 31 | {54F1A74A-017A-4CF8-9D98-BE6E04DD2DE7} 32 | MatrixMult 33 | 
10.0.16299.0 34 | 35 | 36 | 37 | Application 38 | true 39 | v141 40 | MultiByte 41 | 42 | 43 | Application 44 | false 45 | v141 46 | true 47 | MultiByte 48 | 49 | 50 | Application 51 | true 52 | v141 53 | MultiByte 54 | 55 | 56 | Application 57 | false 58 | v141 59 | true 60 | MultiByte 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | Level3 84 | MaxSpeed 85 | true 86 | true 87 | true 88 | Speed 89 | Default 90 | 91 | 92 | Console 93 | 94 | 95 | 96 | 97 | Level3 98 | Disabled 99 | true 100 | true 101 | true 102 | Default 103 | false 104 | AdvancedVectorExtensions2 105 | Speed 106 | false 107 | false 108 | true 109 | Fast 110 | /Qvec-report:2 %(AdditionalOptions) 111 | false 112 | stdcpp17 113 | SyncCThrow 114 | 115 | 116 | Console 117 | 118 | 119 | 120 | 121 | Level3 122 | MaxSpeed 123 | true 124 | true 125 | true 126 | true 127 | 128 | 129 | true 130 | true 131 | true 132 | 133 | 134 | 135 | 136 | Level3 137 | true 138 | true 139 | true 140 | true 141 | Speed 142 | AdvancedVectorExtensions2 143 | Fast 144 | true 145 | true 146 | /Qvec-report:2 /Qpar-report:2 %(AdditionalOptions) 147 | false 148 | false 149 | false 150 | stdcpp17 151 | true 152 | false 153 | true 154 | true 155 | No 156 | false 157 | false 158 | 159 | COFFEELAKE 160 | COFFEELAKE 161 | Coffeelake 162 | 163 | 164 | true 165 | true 166 | Console 167 | true 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /MatrixMult/MatrixMult.vcxproj.filters: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | {4FC737F1-C7A5-4376-A066-2A32D752A2FF} 6 | cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx 7 | 8 | 9 | {93995380-89BD-4b04-88EB-625FBE52EBFB} 10 | h;hh;hpp;hxx;hm;inl;inc;ipp;xsd 11 | 12 | 13 | {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} 14 | rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms 15 | 16 | 17 
| 18 | 19 | Source Files 20 | 21 | 22 | Source Files 23 | 24 | 25 | 26 | 27 | Header Files 28 | 29 | 30 | Header Files 31 | 32 | 33 | -------------------------------------------------------------------------------- /MatrixMult/ThreadPool.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include "CPUUtil.h" 14 | 15 | /* 16 | * Thread pool that respects cache locality on HyperThreaded CPUs (WIN32 API dependent) 17 | * 18 | * Each job is described as an array of N functions. (ideal N=2 for HT) 19 | * For each job, N threads are created and assigned respective functions. 20 | * For a given job, all threads are guaranteed to be on the same physical core. 21 | * No two threads from different jobs are allowed on the same physical core. 22 | * 23 | * 24 | * Why? 25 | * When doing multithreading on cache sensitive tasks, 26 | * we want to keep threads that operate on same or contiguous memory region 27 | * on the same physical core s.t they share the same L2 cache. 28 | * 29 | * Reference: This code is influenced by writeup that explains thread pools at 30 | * https://github.com/mtrebi/thread-pool/blob/master/README.md 31 | * 32 | * Structure: 33 | * CPUUtil: 34 | * Uses Windows API to detect the number of physical cores, cache sizes 35 | * and mapping between physical and logical processors. 36 | * 37 | * HWLocalThreadPool: 38 | * Submission: 39 | * initializer list or vector of (void function (void)) of length N 40 | * where N is the num of threads that will spawn on the same core, 41 | * and, the length of the std::function array. 42 | * ith thread handles repective ith function 43 | * 44 | * Core Handlers: 45 | * We create NumHWCores many CoreHandler objects. 46 | * These objects are responsible for managing their cores. 
47 | * They check the main pool for jobs, when a job is found, 48 | * if N==1 , they call the only function in the job description. 49 | * if N>1 , they assign N-1 threads on the same physical core to, 50 | * respective functions in the array. The CoreHandler is 51 | * assigned to the first function. 52 | * Once CoreHandler finishes its own task, it waits for other threads, 53 | * Then its available for new jobs, waiting to be notified by the pool manager. 54 | * 55 | * Thread Handlers: 56 | * Responsible for handling tasks handed away by the CoreHandler. 57 | * When they finish execution, they signal to notify CoreHandler 58 | * Then, they wait for a new task to run until they are terminated. 59 | * 60 | * Notes: 61 | * 62 | * DON'T KEEP THESE TASKS TOO SMALL. 63 | * We don't want our CoreHandler to check its childrens states constantly, 64 | * So, when a thread finishes a task, we signal the CoreHandler. 65 | * This might become a overhead if the task itself is trivial. 66 | * In that case you probably shouldn't be using this structure anyways, 67 | * But if you want to, you can change it so that, 68 | * CoreHandler periodically checks m_childThreadOnline array and sleeps in between. 
69 | * 70 | */ 71 | 72 | class HWLocalThreadPool { 73 | public: 74 | HWLocalThreadPool(int _numOfCoresToUse, int _numThreadsPerCore) : m_terminate(false) 75 | { 76 | m_numHWCores = CPUUtil::GetNumHWCores(); 77 | 78 | if (_numOfCoresToUse <= 0) { 79 | m_numCoreHandlers = m_numHWCores; 80 | } else { 81 | m_numCoreHandlers = _numOfCoresToUse; 82 | } 83 | 84 | if (_numThreadsPerCore <= 0) { 85 | m_numThreadsPerCore = 86 | CPUUtil::GetNumLogicalProcessors() / m_numCoreHandlers; 87 | } else { 88 | m_numThreadsPerCore = _numThreadsPerCore; 89 | } 90 | 91 | /* malloc m_coreHandlers s.t no default initialization takes place, 92 | we construct every object with placement new */ 93 | m_coreHandlers = (CoreHandler*)malloc(m_numCoreHandlers * sizeof(CoreHandler)); 94 | m_coreHandlerThreads = new std::thread[m_numCoreHandlers]; 95 | 96 | for (int i = 0; i < m_numCoreHandlers; ++i) { 97 | ULONG_PTR processAffinityMask; 98 | int maskQueryRetCode = CPUUtil::GetProcessorMask(i, processAffinityMask); 99 | if (maskQueryRetCode) { 100 | assert(0, "Can't query processor relations."); 101 | return; 102 | } 103 | CoreHandler* coreHandler = 104 | new (&m_coreHandlers[i]) CoreHandler(this, i, processAffinityMask); 105 | m_coreHandlerThreads[i] = std::thread(std::ref(m_coreHandlers[i])); 106 | } 107 | } 108 | 109 | ~HWLocalThreadPool() 110 | { 111 | if (!m_terminate) 112 | Close(); 113 | } 114 | 115 | void Add(std::vector> const& F) 116 | { 117 | m_queue.Push(F); 118 | m_queueToCoreNotifier.notify_one(); 119 | } 120 | 121 | /* if finishQueue is set, cores will termianate after handling every job at the queue 122 | if not, they will finish the current job they have and terminate. 
*/ 123 | void Close(const bool finishQueue = true) 124 | { 125 | { 126 | std::unique_lock lock(m_queueMutex); 127 | m_terminate = 1; 128 | m_waitToFinish = finishQueue; 129 | m_queueToCoreNotifier.notify_all(); 130 | } 131 | 132 | for (int i = 0; i < m_numCoreHandlers; ++i) { 133 | if (m_coreHandlerThreads[i].joinable()) 134 | m_coreHandlerThreads[i].join(); 135 | } 136 | 137 | /* free doesn't call the destructor, so */ 138 | for (int i = 0; i < m_numCoreHandlers; ++i) { 139 | m_coreHandlers[i].~CoreHandler(); 140 | } 141 | free(m_coreHandlers); 142 | delete[] m_coreHandlerThreads; 143 | } 144 | 145 | const unsigned NumCores() 146 | { 147 | return m_numHWCores; 148 | } 149 | 150 | const unsigned NumThreadsPerCore() 151 | { 152 | return m_numThreadsPerCore; 153 | } 154 | 155 | template 156 | static std::function WrapFunc(F&& f, Args&&... args) 157 | { 158 | std::function func = 159 | std::bind(std::forward(f), std::forward(args)...); 160 | auto task_ptr = 161 | std::make_shared>(func); 162 | 163 | std::function wrapper_func = [task_ptr]() { (*task_ptr)(); }; 164 | 165 | return wrapper_func; 166 | } 167 | 168 | protected: 169 | template class Queue { 170 | public: 171 | Queue() 172 | { 173 | } 174 | ~Queue() 175 | { 176 | } 177 | 178 | void Push(T const& element) 179 | { 180 | std::unique_lock lock(m_mutex); 181 | m_queue.push(std::move(element)); 182 | } 183 | 184 | bool Pop(T& function) 185 | { 186 | std::unique_lock lock(m_mutex); 187 | if (!m_queue.empty()) { 188 | function = std::move(m_queue.front()); 189 | m_queue.pop(); 190 | return true; 191 | } 192 | return false; 193 | } 194 | 195 | int Size() 196 | { 197 | std::unique_lock lock(m_mutex); 198 | return m_queue.size(); 199 | } 200 | 201 | private: 202 | std::queue m_queue; 203 | std::mutex m_mutex; 204 | }; 205 | 206 | class CoreHandler { 207 | public: 208 | CoreHandler(HWLocalThreadPool* const _parent, const unsigned _id, 209 | const ULONG_PTR& _processorMask) 210 | : m_parent(_parent), m_id(_id), 
m_processorAffinityMask(_processorMask), 211 | m_terminate(false), m_numChildThreads(_parent->m_numThreadsPerCore - 1) 212 | { 213 | if (m_numChildThreads > 0) { 214 | m_childThreads = new std::thread[m_numChildThreads]; 215 | m_childThreadOnline = new bool[m_numChildThreads]; 216 | std::unique_lock lock(m_threadMutex); 217 | for (int i = 0; i < m_numChildThreads; ++i) { 218 | m_childThreadOnline[i] = 0; 219 | m_childThreads[i] = 220 | std::thread(ThreadHandler(this, i, m_processorAffinityMask)); 221 | } 222 | } 223 | } 224 | 225 | void WaitForChildThreads() 226 | { 227 | if (!m_childThreads || m_numChildThreads < 1) 228 | return; 229 | 230 | std::unique_lock lock(m_threadMutex); 231 | bool anyOnline = 1; 232 | while (anyOnline) { 233 | anyOnline = 0; 234 | for (int i = 0; i < m_numChildThreads; ++i) { 235 | anyOnline |= m_childThreadOnline[i]; 236 | } 237 | if (anyOnline) { 238 | m_threadToCoreNotifier.wait(lock); 239 | } 240 | } 241 | } 242 | 243 | void CloseChildThreads() 244 | { 245 | if (m_terminate || m_numChildThreads < 1) 246 | return; 247 | 248 | { 249 | std::unique_lock lock(m_threadMutex); 250 | m_terminate = 1; 251 | m_coreToThreadNotifier.notify_all(); 252 | } 253 | 254 | /* Core closing threads */ 255 | for (int i = 0; i < m_numChildThreads; ++i) { 256 | if (m_childThreads[i].joinable()) { 257 | m_childThreads[i].join(); 258 | } 259 | } 260 | 261 | delete[] m_childThreads; 262 | delete[] m_childThreadOnline; 263 | } 264 | 265 | void operator()() 266 | { 267 | SetThreadAffinityMask(GetCurrentThread(), m_processorAffinityMask); 268 | bool dequeued; 269 | while (1) { 270 | { 271 | std::unique_lock lock(m_parent->m_queueMutex); 272 | if (m_parent->m_terminate && 273 | !(m_parent->m_waitToFinish && m_parent->m_queue.Size() > 0)) { 274 | break; 275 | } 276 | if (m_parent->m_queue.Size() == 0) { 277 | m_parent->m_queueToCoreNotifier.wait(lock); 278 | } 279 | dequeued = m_parent->m_queue.Pop(m_job); 280 | } 281 | if (dequeued) { 282 | m_ownJob = 
std::move(m_job[0]); 283 | if (m_numChildThreads < 1) { 284 | m_ownJob(); 285 | } else { 286 | { 287 | std::unique_lock lock(m_threadMutex); 288 | for (int i = 0; i < m_numChildThreads; ++i) { 289 | m_childThreadOnline[i] = 1; 290 | } 291 | m_coreToThreadNotifier.notify_all(); 292 | } 293 | 294 | m_ownJob(); 295 | 296 | WaitForChildThreads(); 297 | } 298 | } 299 | } 300 | CloseChildThreads(); 301 | } 302 | 303 | class ThreadHandler { 304 | public: 305 | ThreadHandler(CoreHandler* _parent, const unsigned _id, 306 | const ULONG_PTR& _processorAffinityMask) 307 | : m_parent(_parent), m_processorAffinityMask(_processorAffinityMask), 308 | m_id(_id), m_jobSlot(_id + 1) 309 | { 310 | } 311 | 312 | void operator()() 313 | { 314 | SetThreadAffinityMask(GetCurrentThread(), m_processorAffinityMask); 315 | while (1) { 316 | { 317 | std::unique_lock lock(m_parent->m_threadMutex); 318 | if (m_parent->m_terminate) 319 | break; 320 | if (!m_parent->m_childThreadOnline[m_id]) { 321 | m_parent->m_coreToThreadNotifier.wait(lock); 322 | } 323 | } 324 | bool online = 0; 325 | { 326 | std::unique_lock lock(m_parent->m_threadMutex); 327 | online = m_parent->m_childThreadOnline[m_id]; 328 | } 329 | if (online) { 330 | func = std::move(m_parent->m_job[m_jobSlot]); 331 | func(); 332 | std::unique_lock lock(m_parent->m_threadMutex); 333 | m_parent->m_childThreadOnline[m_id] = 0; 334 | m_parent->m_threadToCoreNotifier.notify_one(); 335 | } 336 | } 337 | } 338 | 339 | const unsigned m_id; 340 | const unsigned m_jobSlot; 341 | CoreHandler* m_parent; 342 | ULONG_PTR m_processorAffinityMask; 343 | std::function func; 344 | }; 345 | 346 | const unsigned m_id; 347 | HWLocalThreadPool* const m_parent; 348 | const ULONG_PTR m_processorAffinityMask; 349 | const unsigned m_numChildThreads; 350 | 351 | std::thread* m_childThreads; 352 | bool* m_childThreadOnline; 353 | bool m_terminate; 354 | 355 | std::vector> m_job; 356 | std::function m_ownJob; 357 | 358 | std::mutex m_threadMutex; 359 | 
std::condition_variable m_coreToThreadNotifier; 360 | std::condition_variable m_threadToCoreNotifier; 361 | }; 362 | 363 | private: 364 | unsigned m_numHWCores, m_numCoreHandlers, m_numThreadsPerCore; 365 | CoreHandler* m_coreHandlers; 366 | std::thread* m_coreHandlerThreads; 367 | 368 | Queue>> m_queue; 369 | 370 | bool m_terminate, m_waitToFinish; 371 | 372 | std::mutex m_queueMutex; 373 | std::condition_variable m_queueToCoreNotifier; 374 | }; 375 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multithreaded, Lightning fast Matrix-Matrix Multiplication 2 | 3 | [See CHANGELOG](#changelog) 4 | 5 | [What's next?](#whats-next) 6 | 7 | In this project, I’ve implemented multiple methods for multiplying 8 | matrices, and relevant utilities. My prime focuses were: 9 | 10 | - Cache locality, memory access patterns. 11 | 12 | - SIMD, hand optimized AVX/FMA intrinsics. 13 | 14 | - Software prefetching to maximize pipeline utilization. 15 | 16 | - Cache friendly multithreading. 17 | 18 | I didn’t implement the Strassen’s algorithm, this code runs on O(N^3). 19 | 20 | # How to run 21 | 22 | **Requirements:** 23 | * Windows platform 24 | * 64-bit Intel CPU with AVX / FMA support 25 | 26 | Currently, if you're looking to use this code, just copy and include CPUUtils.\* ThreadPool.h and copy the contents of MatrixMul.cpp except main() into a namespace, the code should be ready to compile as a header only library. Will tidy up the code into a proper library soon. 27 | 28 | Note that this program relies on Intel specifix cpuid responses and intrinsics and Win32 API for logical-physical processor mapping and setting thread affinity. 29 | 30 | Running the example code: 31 | Build the solution (see build options), then navigate to *x64\\Release\\* and run this command or call “run.bat”. 
If 32 | you don’t have “tee” command, just delete the last part or install 33 | GnuWin32 CoreUtils. 34 | 35 | ``` bash 36 | for /l %x in (1, 1, 100) do echo %x && (MatrixGenerator.exe && printf "Generated valid output. Testing...\n" && MatrixMult.exe matrixA.bin matrixB.bin matrixAB-out.bin && printf \n\n ) | tee -a out.txt 37 | ``` 38 | 39 | # Benchmarks 40 | 41 | On my machine (6 core i7-8700K), I’ve compared my implementation against: 42 | 43 | * Eigen library (with all the compiler optimizations turned on) 44 | * I've tested both Eigen's own implementation and Eigen compiled with MKL+TBB backend, runtime analysis shows that the benchmark indeed uses MKL kernel for matrix multiplication and Eigen doesn't introduce any overheads. 45 | * Multithreaded python-numpy which uses C/C++ backend and Intel MKL BLAS 46 | library. The code can be found under the Benchmarks folder, however the graph below doesn't include it as it was consistently slower than Eigen(MKL+TBB) 47 | 48 | ## Comparison 49 | 50 | Current implementation runs identically or slightly faster than Eigen (MKL+TBB) for all test cases (tested up to N=15K)! Intel Advisor and VTune clearly shows that MKL kernel *avx2_dgemm_kernel_0* is used and no abnormal overheads are present. 51 | 52 | ![benchmark graph](https://raw.githubusercontent.com/talhasaruhan/cpp-matmul/master/Benchmark1.png) 53 | 54 | ## Multithreading utilities ([ThreadPool.h](https://github.com/talhasaruhan/cpp-matmul/blob/master/MatrixMult/ThreadPool.h)) 55 | 56 | ``` c++ 57 | Namespace CPUUtil, 58 | HWLocalThreadPool(NumOfCoresToUse, NumThreadsPerCore) 59 | ``` 60 | 61 | CPUUtil namespace has utility functions for querying runtime system for logical-physical processor mapping, cache sizes, cache line size, hyperthreading, AVX/FMA instruction set support and few more. 62 | 63 | I’ve also implemented a hardware local thread pool to handle jobs for multithreaded 64 | *MTMatMul* function. 
The pool runs every thread corresponding to a job 65 | on the same physical core. Idea is that, on hyperthreaded systems such 66 | as mine, 2 threads that work on contiguous parts of memory should live 67 | on the same core and share the same L1 and L2 cache. 68 | 69 | - Each job is described as an array of N functions. (N=2) 70 | 71 | - For each job, N threads (that were already created) are assigned respective 72 | functions. 73 | 74 | - For a given job, all threads are guaranteed to be on the same 75 | physical core. 76 | 77 | - No two threads from different jobs are allowed on the same physical 78 | core. 79 | 80 | ## MSVC2017 Build options (over default x64 Release build settings) 81 | 82 | - Maximum optimization: /O2 83 | 84 | - Favor fast code /Ot 85 | 86 | - Enable function level linking: /Gy 87 | 88 | - Enable enhanced instruction set: /arch:AVX2 89 | 90 | - Floating point model: /fp:fast 91 | 92 | - Language: /std:c++17 (for several “if constexpr”s, and one std::lcm. otherwise can be 93 | compiled with C++ 11) 94 | 95 | # What's next? 96 | * ~~Still a factor of 2 to achieve MKL performance.~~ Achieved and surpassed Eigen(MKL+TBB) performance for most test cases N<15K. Test and optimize for larger matrices. 97 | * Right now, when the prefetch switches are enabled, instruction retirement rate is about 88%, and the program is neither front-end nor back-end bound, it has excellent pipeline utilization. When the switches are disabled, the retirement rate drops to about 50%, and the program is heavily memory bound, pipelines are heavily stalled due to these bounds. However, on my current system (i7 8700K), binary without prefetching actually computes the output significantly faster (15%). I think this behaviour will heavily rely on the specific CPU, its cache size and performance. Try this on other hardware with different cache performances and varying matrix sizes. 98 | * Wrap the functionality in a replicable and distributable framework that's easy to use. 
99 | 100 | # Changelog 101 | 102 | **Note:** Debugging builds will have arguments pre-set on the MatrixMul.cpp, you can ignore or revert those to accept argument from command line. 103 | 104 | ### 27/11/2018 105 | * Cleaned up the code. Split some behaviours into seperate functions. 106 | * Implemented runtime detection for best block size parameters for the runtime system. 107 | * Tuned software prefetching, now we do multiple smaller prefetches in between arithmetic operations and with a stride between prefetches. 108 | * More arithmetically dense inner loop. Instead of 3x3 blocks, do 4x3 blocks (3b + 12c + 1 temporary a == 16 registers used), 7 loads, 12 arithmetic operations. 109 | * HWLocalThreadPool takes number of cores and threads per core as contructor arguments and is not templated anymore. It never should have been. 110 | * Renamed QueryHWCores namespace to CPUUtils and extended it to support querying cache sizes, HTT/AVX/FMA support etc. using \_\_cpuid. 111 | 112 | ### 15/11/2018 113 | * Implemented **one more level of blocking**, first block holds data in L3 while the second holds the data in L2. To avoid the "job" overhead in thread pool system and to allow for explicit software prefetching, threads groups handle the highest level of blocks. (If the job was issued on lower level blocks, the threads need explicit syncing so that they only issue prefetch command once per L3 block.) 114 | * Implemented **software prefetching**. Now while an L3 block is being computed, next one is loaded into the memory in an asynchronous manner. May implement a similar feature for L2 level blocks later on. 115 | * **Removed** all but one of the *MMHelper_MultBlocks* implementations. 116 | * **Converted** AVX multiply and add intrinsics to **fused multiply add intrinsics** from FMA set. 
117 | * **Now the MultBlocks use the loaded __m256 vectors as long as possible without unloading and loading a new one.** Just like we keep same values in cache and use them as much as possible without unloading, this is the the same idea applied to **YMM registers**. This increased Arithmetic Intensity (FLOP/L1 Transferred Bytes) metric from 0.25 to 0.67, speeding up the entire matrix multiplication by the same ratio. 118 | * Now fully integrated **VTune** into my workflow to analyze the application. 119 | 120 | ### 13/11/2018 121 |
Long and detailed work journal, click to expand 122 |

123 |

    124 |
  • Added a couple of vector sum implementations in the benchmark project to compare different intrinsic approaches. The aim is to achieve maximum throughput with an ILP-minded design. However, the compiler optimizes away the different ways in which I try to maximize the throughput for my own specific CPU architecture.
  • 125 |
  • In order to address this issue, I wrote another benchmark with inline assembly and compiled it with GCC (as MSVC doesn't support inline assembly on the x64 architecture). First of all, I tested GCC's behaviour with intrinsics and found it to be the same as MSVC's for our purposes. Having shown that, I wrote volatile inline assembly to force the compiler to use my implementation. The tests showed that the compiler optimized the intrinsics to almost the same level when optimizations are enabled. But the compiler-optimized versions, and my ASM code, are still not fast enough to compete with BLAS packages. So I'm doing something wrong in the first place, and writing ASM is not the answer.
  • 126 |
  • Benchmarked auto vectorization, naive intrinsics and 2 other intrinsic-based block multiplication implementations; the last 2 methods are about 15% faster than naive intrinsics and auto-vectorized code. But arithmetic intensity (FLOPs / memory accesses) is still quite low.
  • 127 |
  • Started analyzing the bottlenecks further using **Intel's VTune and Advisor**. It now became apparent that while I was getting similar results from different approaches, each had **different bottlenecks** which at first I couldn't see. So with this detailed information I should be able to address those bottlenecks.
  • 128 |
  • Added another intrinsic-based block multiplication method, changed a few implementations to use **FMA** instructions rather than separate multiply-adds, to achieve higher throughput.
  • 129 |
  • When profiling my program, I noticed that small block sizes that fit into the L2 cache yielded a lot of L3 misses, while large blocks that utilized L3 well and cut down the DRAM fetches ran into L2 misses. So, applying the same idea that led to blocking to begin with, I will implement **one more level of blocking** to better utilize multiple layers of cache.
  • 130 |
131 |

132 |
133 | 134 | ### 09/11/2018 135 | * **Fixed memory leaks!** 136 | 137 |
Screenshot of memory usage analysis 138 | 139 |
140 | 141 | (This is the heap profile of the program after running C1 = AB, freeing C1, then running C2=AB and freeing C2. As can be seen here, all the previously leaked mess (packed tasks, function pointers, CoreHandler member arrays etc. ) is now cleaned up nicely. Note: int[] is the static CPU core to logical processor map,) 142 | 143 | * **Properly called destructors** where CoreHandler objects are created using placement new into a malloc'ed buffer. 144 | * **Freed BT.mat** (transpose of B) in the methods that use it to convert the problem into row-row dot product. 145 | * ~~Changed Add function s.t it accepts std::shared_ptr[]>, this is only temporary.~~ 146 | * **Changed the Add() semantics**, now Add function accepts a std::vector>. Preferred way of using Add() function now is with initializer lists: 147 | 148 | ``` 149 | tp.Add({ 150 | HWLocalThreadPool<>::WrapFunc(MMHelper_MultBlocks, 151 | matData, subX, matA.height - rowC, rowC, colC, matA, matB, matBT) , 152 | HWLocalThreadPool<>::WrapFunc(MMHelper_MultBlocks, 153 | matData, subX, matA.height - rowC, rowC, colC + subX, matA, matB, matBT) 154 | }); 155 | ``` 156 | * Added Eigen benchmarks 157 | * Implemented MatMul which should be the general function exposed to outside. It simply selects betwen *MTMatMul* and *ST_TransposedBMatMul* depending on the sizes of the matrices. 
Current impl.: ```A.height*A.width*A.width*B.width < K : ST_TransposedBMatMul o.w : MTMatMul``` 158 | 159 | -------------------------------------------------------------------------------- /benchmark.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/talhasaruhan/cpp-matmul/e1ef1edf935d5af6d79de15b127d1e8ad13f284c/benchmark.xlsx -------------------------------------------------------------------------------- /run.bat: -------------------------------------------------------------------------------- 1 | cd x64/Release/ && run.bat --------------------------------------------------------------------------------