├── .clang-format ├── .coveragerc ├── .github ├── dependabot.yml └── workflows │ ├── ExoBLAS.yml │ ├── gemmini.yml │ ├── linting.yml │ ├── main.yml │ └── packaging.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── apps ├── CMakeLists.txt ├── CMakePresets.json ├── aarch64 │ ├── filter1D │ │ ├── .gitignore │ │ ├── filter1D_cursor_forwarding.py │ │ ├── filter1D_old.py │ │ ├── filter1D_week1.py │ │ └── filter1D_week2.py │ └── sgemm │ │ ├── CMakeLists.txt │ │ ├── build.sh │ │ ├── demo_stage.py │ │ ├── naive_sgemm.cpp │ │ ├── naive_sgemm.h │ │ ├── sgemm.py │ │ └── test.cpp ├── build.sh ├── gemmini │ ├── CMakeLists.txt │ ├── cmake │ │ └── gemmini.cmake │ ├── src │ │ ├── CMakeLists.txt │ │ ├── exo │ │ │ ├── conv.py │ │ │ └── matmul.py │ │ └── platform │ │ │ ├── CMakeLists.txt │ │ │ ├── gemm_acc_malloc.c │ │ │ ├── gemm_acc_malloc.h │ │ │ ├── gemm_malloc.c │ │ │ └── gemm_malloc.h │ └── test │ │ ├── CMakeLists.txt │ │ ├── bare-metal │ │ ├── CMakeLists.txt │ │ └── shims │ │ │ ├── gemmini.c │ │ │ ├── include │ │ │ ├── gemmini.h.in │ │ │ ├── gemmini_counter.h.in │ │ │ ├── gemmini_params.h.in │ │ │ └── gemmini_testutils.h.in │ │ │ └── rocc-software │ │ │ └── src │ │ │ └── xcustom.h.in │ │ ├── helpers.c │ │ ├── helpers.h │ │ ├── run_conv.c │ │ └── run_matmul.c ├── plot.py └── x86 │ ├── CMakeLists.txt │ ├── conv │ ├── CMakeLists.txt │ ├── Halide_Conv_Schedule.txt │ ├── bench_conv.cpp │ ├── conv.py │ ├── conv_instance.hpp │ ├── exo_conv.cpp │ ├── exo_conv.hpp │ ├── halide_conv.cpp │ ├── halide_conv.hpp │ ├── halide_conv_gen.cpp │ ├── onednn_conv.cpp │ ├── onednn_conv.hpp │ └── test_conv.cpp │ ├── halide │ ├── blur │ │ ├── .gitignore │ │ ├── README.md │ │ ├── blur.py │ │ ├── gray.png │ │ ├── gray_scaled.png │ │ └── main.cpp │ └── unsharp │ │ └── unsharp.py │ ├── resnet │ └── CMakeLists.txt │ ├── sgemm │ ├── CMakeLists.txt │ ├── alex_sgemm.cpp │ ├── alex_sgemm.h │ ├── bench_sgemm.cpp │ ├── sgemm.py │ └── 
test.cpp │ └── ssyrk │ ├── CMakeLists.txt │ └── bench_ssyrk.cpp ├── dev-requirements.txt ├── docs ├── Cursors.md ├── Design.md ├── Imports.md ├── Metaprogramming.md ├── Procedures.md ├── README.md ├── System.md ├── externs.md ├── images │ └── system-overview.png ├── inspection.md ├── instructions.md ├── memories.md ├── object_code.md └── primitives │ ├── backend_ops.md │ ├── buffer_ops.md │ ├── config_ops.md │ ├── loop_ops.md │ ├── other_ops.md │ └── subproc_ops.md ├── examples ├── README.md ├── avx2_matmul │ ├── Makefile │ ├── README.md │ ├── main.c │ └── x86_matmul.py ├── cursors │ ├── .gitignore │ ├── README.md │ └── cursors.py ├── quiz1 │ ├── .gitignore │ ├── README.md │ └── quiz1.py ├── quiz2 │ ├── .gitignore │ ├── README.md │ └── quiz2.py ├── quiz3 │ ├── .gitignore │ ├── README.md │ └── quiz3.py └── rvm_conv1d │ ├── .gitignore │ ├── Makefile │ ├── README.md │ ├── conv1Di32.h │ ├── exo │ ├── .gitignore │ └── conv1d.py │ ├── gen_stimuli.py │ └── main.c ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── exo │ ├── API.py │ ├── API_cursors.py │ ├── API_scheduling.py │ ├── API_types.py │ ├── __init__.py │ ├── backend │ ├── LoopIR_compiler.py │ ├── __init__.py │ ├── mem_analysis.py │ ├── parallel_analysis.py │ ├── prec_analysis.py │ └── win_analysis.py │ ├── cmake │ ├── AddExoLibrary.cmake │ ├── ExoConfig.cmake │ └── ExoConfigVersion.cmake │ ├── core │ ├── LoopIR.py │ ├── LoopIR_pprint.py │ ├── __init__.py │ ├── configs.py │ ├── extern.py │ ├── internal_cursors.py │ ├── memory.py │ ├── prelude.py │ └── proc_eqv.py │ ├── frontend │ ├── __init__.py │ ├── boundscheck.py │ ├── parse_fragment.py │ ├── pattern_match.py │ ├── pyparser.py │ ├── syntax.py │ └── typecheck.py │ ├── libs │ ├── __init__.py │ ├── custom_malloc.c │ ├── custom_malloc.h │ ├── exo_arm_sve.h │ ├── externs.py │ ├── gemm_acc_malloc.c │ ├── gemm_acc_malloc.h │ ├── gemm_malloc.c │ ├── gemm_malloc.h │ └── memories.py │ ├── main.py │ ├── platforms │ ├── __init__.py │ ├── 
gemmini.py │ ├── neon.py │ ├── rvv.py │ ├── sve_vla.py │ ├── sve_vls.py │ └── x86.py │ ├── rewrite │ ├── LoopIR_scheduling.py │ ├── LoopIR_unification.py │ ├── __init__.py │ ├── analysis_simplify.py │ ├── new_analysis_core.py │ ├── new_eff.py │ └── range_analysis.py │ └── stdlib │ ├── __init__.py │ ├── analysis.py │ ├── halide_scheduling_ops.py │ ├── higher_order.py │ ├── inspection.py │ ├── range_analysis.py │ ├── rc_wrappers.py │ ├── scheduling.py │ └── stdlib.py ├── tests ├── __init__.py ├── amx │ ├── __init__.py │ ├── amx.py │ ├── amx_experiment.c │ ├── amx_experiment.cpp │ ├── harness_amx.py │ └── test_amx_instr.py ├── asplos25 │ ├── gemmini_schedules.py │ ├── test_gemmini_matmul_new.py │ ├── test_gemmini_matmul_old.py │ └── test_higher_order.py ├── conftest.py ├── golden │ ├── asplos25 │ │ ├── test_gemmini_matmul_new │ │ │ └── test_matmul.txt │ │ ├── test_gemmini_matmul_old │ │ │ └── test_matmul.txt │ │ └── test_higher_order │ │ │ ├── test_lrn.txt │ │ │ ├── test_reduce.txt │ │ │ ├── test_reframe.txt │ │ │ └── test_repeat.txt │ ├── test_apps │ │ ├── test_blur.txt │ │ ├── test_gemmini_conv.txt │ │ ├── test_gemmini_matmul.txt │ │ ├── test_neon_sgemm.txt │ │ ├── test_unsharp.txt │ │ ├── test_x86_conv.txt │ │ └── test_x86_sgemm.txt │ ├── test_codegen │ │ ├── test_CIR_USub.txt │ │ ├── test_coercion_to_f16.txt │ │ ├── test_coercion_to_f32.txt │ │ ├── test_coercion_to_f64.txt │ │ ├── test_coercion_to_i32.txt │ │ ├── test_coercion_to_i8.txt │ │ ├── test_coercion_to_index.txt │ │ ├── test_coercion_to_ui16.txt │ │ ├── test_coercion_to_ui8.txt │ │ ├── test_const_buffer_parameters.txt │ │ ├── test_const_local_buffer.txt │ │ ├── test_const_local_window.txt │ │ ├── test_memcpy_instr.txt │ │ ├── test_no_exo_floor_div_after_divide_loop_with_guard.txt │ │ ├── test_no_exo_floor_div_triangular_access.txt │ │ ├── test_pragma_parallel_loop.txt │ │ └── test_target_another_exo_library.txt │ ├── test_config │ │ ├── test_basic_config.txt │ │ ├── test_config_bind.txt │ │ ├── 
test_config_fission.txt │ │ ├── test_config_write.txt │ │ ├── test_ld.txt │ │ ├── test_loop_complex_guards.txt │ │ ├── test_stride_with_config.txt │ │ ├── test_write_all_control.txt │ │ ├── test_write_loop_builtin.txt │ │ └── test_write_loop_const_number.txt │ ├── test_cursors │ │ ├── test_arg_cursor.txt │ │ ├── test_basic_forwarding.txt │ │ ├── test_basic_forwarding2.txt │ │ ├── test_basic_forwarding3.txt │ │ ├── test_bind_expr_forwarding.txt │ │ ├── test_cursor_print.txt │ │ ├── test_gap_forwarding.txt │ │ ├── test_get_enclosing_loop_by_name.txt │ │ ├── test_get_stmt_within_scope.txt │ │ ├── test_match_parent.txt │ │ ├── test_match_parent_2.txt │ │ ├── test_reorder_loops_forwarding.txt │ │ ├── test_simplify_forwarding.txt │ │ ├── test_unroll_buffer_forwarding.txt │ │ └── test_vectorize_forwarding.txt │ ├── test_examples │ │ ├── test_avx2_matmul.txt │ │ ├── test_cursors.txt │ │ ├── test_quiz1.txt │ │ ├── test_quiz3.txt │ │ └── test_rvm_conv1d.txt │ ├── test_externs │ │ ├── test_expf.txt │ │ ├── test_extern_find.txt │ │ ├── test_fmaxf.txt │ │ ├── test_relu.txt │ │ ├── test_relu2.txt │ │ ├── test_relu3.txt │ │ ├── test_relu4.txt │ │ ├── test_select.txt │ │ ├── test_sigmoid.txt │ │ ├── test_sin.txt │ │ └── test_sqrt.txt │ ├── test_halide_ops │ │ ├── test_compute_at_with_prologue.txt │ │ ├── test_schedule_blur1d.txt │ │ ├── test_schedule_blur2d.txt │ │ └── test_schedule_tiled_blur2d.txt │ ├── test_im2col │ │ └── test_im2col.txt │ ├── test_internal_cursors │ │ ├── test_block_delete.txt │ │ ├── test_block_delete_whole_block.txt │ │ ├── test_block_replace.txt │ │ ├── test_block_replace_forwarding_for_blocks.txt │ │ ├── test_cursor_pretty_print_blocks.txt │ │ ├── test_cursor_pretty_print_gaps.txt │ │ ├── test_cursor_pretty_print_nodes.txt │ │ ├── test_cursor_replace_expr.txt │ │ ├── test_cursor_replace_expr_deep.txt │ │ ├── test_delete_forwarding_for_blocks.txt │ │ ├── test_double_insert_forwarding.txt │ │ ├── test_gap_insert_pass.txt │ │ ├── 
test_insert_forwarding_for_blocks.txt │ │ ├── test_insert_root_end.txt │ │ ├── test_insert_root_front.txt │ │ ├── test_move_block.txt │ │ ├── test_move_forward_diff_scopes_1.txt │ │ ├── test_move_forward_if_orelse.txt │ │ ├── test_move_forwarding_for_blocks.txt │ │ ├── test_move_forwarding_for_blocks_gap_after.txt │ │ ├── test_node_replace.txt │ │ ├── test_wrap_block.txt │ │ └── test_wrap_forwarding_for_blocks.txt │ ├── test_metaprogramming │ │ ├── test_capture_nested_quote.txt │ │ ├── test_captured_closure.txt │ │ ├── test_conditional.txt │ │ ├── test_constant_lifting.txt │ │ ├── test_eval_expr_in_mem.txt │ │ ├── test_implicit_lhs_unquote.txt │ │ ├── test_local_externs.txt │ │ ├── test_proc_shadowing.txt │ │ ├── test_quote_complex_expr.txt │ │ ├── test_quote_elision.txt │ │ ├── test_scope_collision1.txt │ │ ├── test_scope_collision2.txt │ │ ├── test_scope_nesting.txt │ │ ├── test_scoping.txt │ │ ├── test_statement_assignment.txt │ │ ├── test_statements.txt │ │ ├── test_type_params.txt │ │ ├── test_type_quote_elision.txt │ │ ├── test_unary_ops.txt │ │ ├── test_unquote_elision.txt │ │ ├── test_unquote_in_slice.txt │ │ ├── test_unquote_index_tuple.txt │ │ ├── test_unquote_slice_object1.txt │ │ └── test_unrolling.txt │ ├── test_neon │ │ ├── test_gen_neon_simple_math_scheduling.txt │ │ ├── test_gen_neon_vfmla.txt │ │ └── test_gen_neon_vfmla_f16.txt │ ├── test_new_eff │ │ ├── test_alloc_success.txt │ │ ├── test_delete_config_basic.txt │ │ ├── test_delete_config_bc_redundant.txt │ │ ├── test_delete_config_bc_shadow.txt │ │ ├── test_delete_config_subproc_basic.txt │ │ ├── test_reorder_loops_4pt_stencil_succeed.txt │ │ ├── test_reorder_loops_requiring_seq.txt │ │ └── test_reorder_loops_success.txt │ ├── test_parallel │ │ └── test_pragma_parallel_loop.txt │ ├── test_precision │ │ ├── test_good_prec2.txt │ │ └── test_good_ui8_prec.txt │ ├── test_reflection │ │ └── test_show_effect.txt │ ├── test_rvv │ │ └── test_gen_rvv.txt │ ├── test_schedules │ │ ├── test_add_loop.txt │ │ 
├── test_add_loop1.txt │ │ ├── test_add_loop2.txt │ │ ├── test_add_loop3.txt │ │ ├── test_bind_cursor_arg.txt │ │ ├── test_bind_expr_cse.txt │ │ ├── test_bind_expr_cse_2.txt │ │ ├── test_bind_expr_diff_indices.txt │ │ ├── test_bind_lhs.txt │ │ ├── test_bool_partial_eval.txt │ │ ├── test_commute.txt │ │ ├── test_commute3.txt │ │ ├── test_cut_loop2.txt │ │ ├── test_cut_loop_at_hi.txt │ │ ├── test_cut_loop_at_lo.txt │ │ ├── test_cut_loop_by_expr.txt │ │ ├── test_cut_loop_by_expr1.txt │ │ ├── test_cut_loop_by_expr2.txt │ │ ├── test_cut_loop_nonzero_lo.txt │ │ ├── test_cut_loop_nonzero_lo2.txt │ │ ├── test_cut_loop_syrk.txt │ │ ├── test_cut_then_shift_loop.txt │ │ ├── test_delete_buffer.txt │ │ ├── test_delete_pass.txt │ │ ├── test_delete_pass_1.txt │ │ ├── test_divide_dim_1.txt │ │ ├── test_divide_dim_2.txt │ │ ├── test_divide_dim_3.txt │ │ ├── test_divide_loop_by_1_cut.txt │ │ ├── test_divide_loop_by_1_guard.txt │ │ ├── test_divide_loop_cut_and_guard.txt │ │ ├── test_divide_loop_perfect.txt │ │ ├── test_divide_loop_perfect2.txt │ │ ├── test_divide_loop_perfect3.txt │ │ ├── test_divide_with_recompute.txt │ │ ├── test_double_fission.txt │ │ ├── test_eliminate_dead_code.txt │ │ ├── test_eliminate_dead_code2.txt │ │ ├── test_eliminate_dead_code3.txt │ │ ├── test_eliminate_dead_code4.txt │ │ ├── test_eliminate_dead_code7.txt │ │ ├── test_eliminate_dead_code8.txt │ │ ├── test_eliminate_dead_code9.txt │ │ ├── test_expand_dim.txt │ │ ├── test_expand_dim3.txt │ │ ├── test_expand_dim4.txt │ │ ├── test_expand_dim5.txt │ │ ├── test_expand_dim6.txt │ │ ├── test_extract_subproc.txt │ │ ├── test_extract_subproc2.txt │ │ ├── test_extract_subproc3.txt │ │ ├── test_extract_subproc4.txt │ │ ├── test_extract_subproc5.txt │ │ ├── test_extract_subproc6.txt │ │ ├── test_extract_subproc7.txt │ │ ├── test_fission.txt │ │ ├── test_fission_after_simple.txt │ │ ├── test_fold_buffer_blur.txt │ │ ├── test_fold_buffer_if_stmt.txt │ │ ├── test_fold_buffer_loop_in_context.txt │ │ ├── 
test_fold_buffer_loop_simple.txt │ │ ├── test_fold_buffer_sequential_stmts.txt │ │ ├── test_fold_buffer_unsharp.txt │ │ ├── test_fold_buffer_within_stmt.txt │ │ ├── test_fold_into_reduce_1.txt │ │ ├── test_fold_into_reduce_2.txt │ │ ├── test_formatted_expr_1.txt │ │ ├── test_formatted_expr_2.txt │ │ ├── test_formatted_expr_3.txt │ │ ├── test_fuse_if.txt │ │ ├── test_fuse_loop.txt │ │ ├── test_fuse_loop2.txt │ │ ├── test_fuse_loop_commute_config.txt │ │ ├── test_inline_assign.txt │ │ ├── test_inline_assign_scalar.txt │ │ ├── test_inline_window.txt │ │ ├── test_inline_window2.txt │ │ ├── test_inline_window3.txt │ │ ├── test_insert_noop_call.txt │ │ ├── test_join_loops_body_match.txt │ │ ├── test_join_loops_equiv_but_diff_bounds.txt │ │ ├── test_left_reassociate_expr_1.txt │ │ ├── test_left_reassociate_expr_2.txt │ │ ├── test_lift.txt │ │ ├── test_lift_alloc_simple.txt │ │ ├── test_lift_alloc_simple2.txt │ │ ├── test_lift_alloc_simple3.txt │ │ ├── test_lift_alloc_simple_empty_body.txt │ │ ├── test_lift_if_halfway.txt │ │ ├── test_lift_if_in_else_branch_of_parent.txt │ │ ├── test_lift_if_in_full_nest.txt │ │ ├── test_lift_if_middle.txt │ │ ├── test_lift_if_past_for.txt │ │ ├── test_lift_if_past_if.txt │ │ ├── test_lift_if_past_if_then_for.txt │ │ ├── test_lift_if_with_else_past_if.txt │ │ ├── test_lift_if_with_else_past_if_with_else.txt │ │ ├── test_lift_if_with_pass_body.txt │ │ ├── test_lift_if_with_pass_body_and_else.txt │ │ ├── test_lift_reduce_constant_1.txt │ │ ├── test_lift_reduce_constant_2.txt │ │ ├── test_lift_reduce_constant_3.txt │ │ ├── test_lift_scope.txt │ │ ├── test_mem_aware_replace.txt │ │ ├── test_merge_writes_all_4_cases.txt │ │ ├── test_merge_writes_array_indexing.txt │ │ ├── test_merge_writes_consecutively.txt │ │ ├── test_merge_writes_type_check.txt │ │ ├── test_mult_dim_1.txt │ │ ├── test_new_expr_multi_vars.txt │ │ ├── test_old_lift_alloc_config.txt │ │ ├── test_parallelize_loop.txt │ │ ├── test_pattern_matching_id_in_scheduling_ops.txt │ │ ├── 
test_product_loop.txt │ │ ├── test_product_loop2.txt │ │ ├── test_product_loop4.txt │ │ ├── test_product_loop5.txt │ │ ├── test_rearrange_dim.txt │ │ ├── test_rearrange_dim_2.txt │ │ ├── test_reassociate_then_fold.txt │ │ ├── test_remove_loop.txt │ │ ├── test_remove_loop_deterministic.txt │ │ ├── test_reorder_stmts.txt │ │ ├── test_replace_all_arch.txt │ │ ├── test_replace_all_length_mismatch.txt │ │ ├── test_replace_all_unambiguous.txt │ │ ├── test_replace_once.txt │ │ ├── test_resize_dim.txt │ │ ├── test_resize_dim_2.txt │ │ ├── test_resize_dim_3.txt │ │ ├── test_resize_dim_4.txt │ │ ├── test_resize_dim_5.txt │ │ ├── test_reuse_buffer.txt │ │ ├── test_reuse_buffer2.txt │ │ ├── test_rewrite_expr.txt │ │ ├── test_rewrite_expr_2.txt │ │ ├── test_set_precision_api_type.txt │ │ ├── test_shift_loop.txt │ │ ├── test_shift_loop_by_expr.txt │ │ ├── test_shift_loop_nonzero_lo.txt │ │ ├── test_simple_bind_expr.txt │ │ ├── test_simple_divide_loop.txt │ │ ├── test_simple_fission.txt │ │ ├── test_simple_inline.txt │ │ ├── test_simple_lift_alloc.txt │ │ ├── test_simple_partial_eval.txt │ │ ├── test_simple_reorder.txt │ │ ├── test_simple_reorder2.txt │ │ ├── test_simple_typ_and_mem.txt │ │ ├── test_simple_typ_and_mem_2.txt │ │ ├── test_simple_unroll.txt │ │ ├── test_simple_unroll2.txt │ │ ├── test_simplify.txt │ │ ├── test_simplify2.txt │ │ ├── test_simplify3.txt │ │ ├── test_simplify4.txt │ │ ├── test_simplify_div_mod_staging.txt │ │ ├── test_simplify_index_div.txt │ │ ├── test_simplify_index_div1.txt │ │ ├── test_simplify_index_div2.txt │ │ ├── test_simplify_index_div3.txt │ │ ├── test_simplify_index_div4.txt │ │ ├── test_simplify_index_div5.txt │ │ ├── test_simplify_index_div6.txt │ │ ├── test_simplify_index_div_fail.txt │ │ ├── test_simplify_index_div_fail1.txt │ │ ├── test_simplify_index_div_fail2.txt │ │ ├── test_simplify_index_mod.txt │ │ ├── test_simplify_index_mod1.txt │ │ ├── test_simplify_index_mod2.txt │ │ ├── test_simplify_index_mod3.txt │ │ ├── 
test_simplify_index_mod4.txt │ │ ├── test_simplify_index_mod5.txt │ │ ├── test_simplify_index_nested_div_mod.txt │ │ ├── test_simplify_logical.txt │ │ ├── test_simplify_loop_bounds.txt │ │ ├── test_simplify_nested_div.txt │ │ ├── test_simplify_nested_div_2.txt │ │ ├── test_simplify_with_window_stmts.txt │ │ ├── test_sink_alloc_simple_for_loop.txt │ │ ├── test_sink_alloc_simple_if_stmt.txt │ │ ├── test_sink_alloc_when_if_has_else.txt │ │ ├── test_specialize.txt │ │ ├── test_specialize_blocks.txt │ │ ├── test_specialize_sizes.txt │ │ ├── test_split_write.txt │ │ ├── test_stage_mem.txt │ │ ├── test_stage_mem_accum.txt │ │ ├── test_stage_mem_accum2.txt │ │ ├── test_stage_mem_assign.txt │ │ ├── test_stage_mem_assign2.txt │ │ ├── test_stage_mem_asum.txt │ │ ├── test_stage_mem_okay.txt │ │ ├── test_stage_mem_out_of_bound_block.txt │ │ ├── test_stage_mem_out_of_bound_point.txt │ │ ├── test_stage_mem_out_of_bound_reduction_accum.txt │ │ ├── test_stage_mem_out_of_bounds_load_1D.txt │ │ ├── test_stage_mem_out_of_bounds_load_2D_one_cond.txt │ │ ├── test_stage_mem_out_of_bounds_load_2D_two_conds.txt │ │ ├── test_stage_mem_out_of_bounds_reduction.txt │ │ ├── test_stage_mem_out_of_bounds_store_1D.txt │ │ ├── test_stage_mem_point.txt │ │ ├── test_stage_mem_recursive.txt │ │ ├── test_stage_mem_reduce.txt │ │ ├── test_stage_mem_reduce2.txt │ │ ├── test_stage_mem_twice.txt │ │ ├── test_transpose.txt │ │ ├── test_unify1.txt │ │ ├── test_unify10.txt │ │ ├── test_unify11.txt │ │ ├── test_unify12.txt │ │ ├── test_unify2.txt │ │ ├── test_unify3.txt │ │ ├── test_unify4.txt │ │ ├── test_unify5.txt │ │ ├── test_unify6.txt │ │ ├── test_unify7.txt │ │ ├── test_unify8.txt │ │ ├── test_unify9.txt │ │ ├── test_unroll_buffer.txt │ │ ├── test_unroll_buffer1.txt │ │ └── test_unroll_buffer6.txt │ ├── test_sve_vla │ │ ├── test_compile_sve_vla_svmla.txt │ │ └── test_gen_sve_vla_svmla.txt │ ├── test_sve_vls │ │ ├── test_compile_sve_vls_svmla.txt │ │ └── test_gen_sve_vls_svmla.txt │ ├── test_uast │ │ ├── 
test_alloc_nest.txt │ │ ├── test_conv1d.txt │ │ └── test_unary_neg.txt │ ├── test_window │ │ ├── test_normalize.txt │ │ ├── test_stride_assert.txt │ │ ├── test_window.txt │ │ └── test_window_stmt.txt │ └── test_x86 │ │ ├── test_avx2_divide_by_3.txt │ │ └── test_gen_avx2_simple_math_scheduling.txt ├── input.png ├── test_apps.py ├── test_bounds.py ├── test_codegen.py ├── test_config.py ├── test_cursors.py ├── test_error_reporting.py ├── test_examples.py ├── test_externs.py ├── test_forwarding.py ├── test_halide_ops.py ├── test_im2col.py ├── test_internal_cursors.py ├── test_metaprogramming.py ├── test_neon.py ├── test_new_eff.py ├── test_parallel.py ├── test_precision.py ├── test_range_analysis.py ├── test_rvv.py ├── test_schedules.py ├── test_sve_vla.py ├── test_sve_vls.py ├── test_typecheck.py ├── test_uast.py ├── test_window.py ├── test_winograd.py ├── test_x86.py └── winograd.py └── tox.ini /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | BasedOnStyle: LLVM 3 | AlignAfterOpenBracket: DontAlign 4 | SpacesBeforeTrailingComments: 2 5 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 6 | AlignAfterOpenBracket: DontAlign 7 | AlwaysBreakTemplateDeclarations: Yes 8 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [report] 2 | exclude_lines = 3 | @proc 4 | @instr 5 | @config 6 | pragma: no cover 7 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | -------------------------------------------------------------------------------- /.github/workflows/ExoBLAS.yml: 
-------------------------------------------------------------------------------- 1 | name: ExoBLAS 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | build-test: 11 | uses: exo-lang/ExoBLAS/.github/workflows/build-test.yml@main 12 | with: 13 | exo-ref: ${{ github.ref }} 14 | exo-blas-ref: main 15 | -------------------------------------------------------------------------------- /.github/workflows/gemmini.yml: -------------------------------------------------------------------------------- 1 | name: Gemmini CI 2 | on: 3 | push: 4 | branches: 5 | - main 6 | pull_request: 7 | jobs: 8 | gemmini: 9 | runs-on: ubuntu-22.04 10 | 11 | container: 12 | image: ghcr.io/exo-lang/gemmini:latest 13 | credentials: 14 | username: ${{ github.actor }} 15 | password: ${{ secrets.github_token }} 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.10' 22 | 23 | - name: Install dependencies 24 | run: | 25 | apt-get update 26 | apt-get install -y ninja-build 27 | python -m pip install -r requirements.txt 28 | python -m pip install -r dev-requirements.txt 29 | python -m pip install cmake build 30 | 31 | - name: Install exo 32 | run: | 33 | python -m build 34 | python -m pip install dist/*.whl 35 | 36 | - name: Configure, build, and run GEMMINI tests 37 | shell: bash 38 | run: | 39 | cd /root/chipyard 40 | source $(conda info --base)/etc/profile.d/conda.sh 41 | source env.sh 42 | cd - 43 | cmake -G Ninja -S apps/gemmini -B build -DCMAKE_BUILD_TYPE=Release 44 | cmake --build build --verbose 45 | cd build 46 | ctest -V 47 | -------------------------------------------------------------------------------- /.github/workflows/linting.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | on: 3 | pull_request_target: 4 | jobs: 5 | lint: 6 | runs-on: ubuntu-latest 7 | steps: 8 | - uses: actions/checkout@v4 9 | with: 10 | ref: ${{ 
github.event.pull_request.head.ref }} 11 | 12 | # Format C files 13 | - uses: DoozyX/clang-format-lint-action@v0.13 14 | with: 15 | source: '.' 16 | exclude: './dependencies ./tests/gemmini/gemmini-rocc-tests' 17 | clangFormatVersion: 12 18 | inplace: ${{ github.event.pull_request.head.repo.full_name == github.repository }} 19 | 20 | # Format Python files 21 | - uses: psf/black@stable 22 | with: 23 | options: "" 24 | version: "22.10.0" 25 | 26 | # Push changes to internal PRs 27 | - uses: EndBug/add-and-commit@v4 28 | if: github.event.pull_request.head.repo.full_name == github.repository 29 | with: 30 | message: '🤖 apply linter changes (will not trigger CI)' 31 | env: 32 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 33 | -------------------------------------------------------------------------------- /.github/workflows/packaging.yml: -------------------------------------------------------------------------------- 1 | name: Packaging 2 | on: 3 | release: 4 | types: [ created ] 5 | push: 6 | branches: [ 'main' ] 7 | pull_request: 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - uses: actions/setup-python@v4 15 | 16 | - name: Install Python dependencies 17 | run: | 18 | python -m pip install -U pip 19 | python -m pip install setuptools wheel twine build 20 | 21 | - name: Build Python package 22 | run: python -m build --sdist --wheel --outdir dist/ . 23 | 24 | # Will only run on release. 
25 | - name: Publish distribution to PyPI 26 | if: startsWith(github.ref, 'refs/tags') 27 | uses: pypa/gh-action-pypi-publish@release/v1 28 | with: 29 | password: ${{ secrets.PYPI_API_TOKEN }} 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | __pycache__ 3 | 4 | .coverage 5 | htmlcov 6 | 7 | tmp/ 8 | gemmini_build/ 9 | amx_build/ 10 | 11 | .ipynb_checkpoints 12 | .exo_c_cache 13 | 14 | venv/ 15 | dist/ 16 | 17 | *.egg-info/ 18 | 19 | *.swp 20 | *~ 21 | *# 22 | build 23 | autom4te.cache 24 | configure 25 | 26 | dependencies/chipyard 27 | 28 | .vscode 29 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "dependencies/benchmark"] 2 | path = dependencies/benchmark 3 | url = https://github.com/google/benchmark.git 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 22.10.0 4 | hooks: 5 | - id: black 6 | language_version: python3 7 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 exo-lang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | 3 | # Include the README 4 | include *.md 5 | 6 | # Include setup.py 7 | include setup.py 8 | 9 | # Include CMake support files 10 | recursive-include src/exo/cmake * 11 | 12 | # Include C source resources from exo.libs 13 | recursive-include src/exo/libs * 14 | -------------------------------------------------------------------------------- /apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | project(apps) 3 | 4 | if (PROJECT_IS_TOP_LEVEL) 5 | include(CTest) 6 | endif () 7 | 8 | add_subdirectory(x86) 9 | -------------------------------------------------------------------------------- /apps/CMakePresets.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 3, 3 | "cmakeMinimumRequired": { 4 | "major": 3, 5 | "minor": 21, 6 | "patch": 0 7 | }, 8 | "configurePresets": [ 9 | { 10 | "name": "gha-Linux", 11 | "displayName": "GHA (Linux)", 12 | "description": "GitHub Actions Linux build with Intel SDE emulator", 13 | "generator": "Ninja", 14 | "cacheVariables": { 
15 | "CMAKE_C_FLAGS": "-march=skylake-avx512", 16 | "CMAKE_CXX_FLAGS": "-march=skylake-avx512", 17 | "CMAKE_CROSSCOMPILING_EMULATOR": "$env{SDE_PATH}/sde64;--", 18 | "CMAKE_BUILD_TYPE": "Release" 19 | } 20 | }, 21 | { 22 | "name": "gha-macOS", 23 | "displayName": "GHA (macOS)", 24 | "description": "GitHub Actions macOS build with Intel SDE emulator", 25 | "generator": "Ninja", 26 | "cacheVariables": { 27 | "CMAKE_C_FLAGS": "-march=skylake-avx512", 28 | "CMAKE_CXX_FLAGS": "-march=skylake-avx512", 29 | "CMAKE_CROSSCOMPILING_EMULATOR": "$env{SDE_PATH}/sde64;--", 30 | "CMAKE_BUILD_TYPE": "Release", 31 | "Python3_ROOT_DIR": "$env{pythonLocation}" 32 | } 33 | } 34 | ] 35 | } 36 | -------------------------------------------------------------------------------- /apps/aarch64/filter1D/.gitignore: -------------------------------------------------------------------------------- 1 | test/ 2 | -------------------------------------------------------------------------------- /apps/aarch64/filter1D/filter1D_week1.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from exo import * 4 | from exo.platforms.neon import * 5 | from exo.stdlib.scheduling import * 6 | 7 | neon_instructions = [ 8 | neon_zero_4xf32, 9 | neon_vfmadd_4xf32_1xf32, 10 | neon_vld_4xf32, 11 | neon_vst_4xf32, 12 | ] 13 | 14 | 15 | @proc 16 | def filter1D(ow: size, kw: size, x: f32[ow + kw - 1], y: f32[ow], w: f32[kw]): 17 | for o in seq(0, ow): 18 | y[o] = 0.0 19 | for k in seq(0, kw): 20 | y[o] += x[o + k] * w[k] 21 | 22 | 23 | # divide 24 | filter1D = divide_loop(filter1D, "o", 4, ["outXo", "outXi"], tail="cut_and_guard") 25 | 26 | # stage sum 27 | filter1D = simplify( 28 | stage_mem(filter1D, "for outXi in _:_", "y[4*outXo:4*outXo+4]", "sum") 29 | ) 30 | filter1D = fission(filter1D, filter1D.find("sum[_] = 0.0").after()) 31 | filter1D = reorder_loops(filter1D, "outXi k") 32 | 33 | # stage x 34 | filter1D = simplify( 35 | 
stage_mem(filter1D, "for outXi in _:_ #1", "x[k+4 * outXo: k+4*outXo + 4]", "xX4") 36 | ) 37 | 38 | # set memories & precision 39 | filter1D = set_memory(filter1D, "sum", Neon) 40 | filter1D = set_memory(filter1D, "xX4", Neon) 41 | 42 | # replace 43 | filter1D = replace_all(filter1D, neon_instructions) 44 | 45 | print(filter1D) 46 | 47 | __all__ = ["filter1D"] 48 | -------------------------------------------------------------------------------- /apps/aarch64/sgemm/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | project(aarch64_sgemm) 3 | 4 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -ffp-contract=fast -framework Accelerate -save-temps=obj") 5 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math -ffp-contract=fast -framework Accelerate -save-temps=obj") 6 | set(CMAKE_PREFIX_PATH "${CMAKE_PREFIX_PATH}:../../../src/exo/cmake") 7 | 8 | if (EXISTS "$ENV{PYENV_ROOT}") 9 | execute_process( 10 | COMMAND "$ENV{PYENV_ROOT}/shims/python" "-c" "import platform ; print(platform.python_version())" 11 | OUTPUT_VARIABLE python_version 12 | OUTPUT_STRIP_TRAILING_WHITESPACE 13 | ) 14 | set(Python3_ROOT_DIR "$ENV{PYENV_ROOT}/versions/${python_version}" 15 | CACHE PATH "Path to Python3 root directory") 16 | endif () 17 | 18 | find_package(Exo REQUIRED) 19 | 20 | # ---------------------------------------------------------------------------- # 21 | # Exo libraries 22 | 23 | add_exo_library( 24 | NAME sgemm 25 | SOURCES sgemm.py 26 | ) 27 | 28 | # ---------------------------------------------------------------------------- # 29 | 30 | add_executable(test naive_sgemm.cpp test.cpp) 31 | target_link_libraries(test PRIVATE aarch64_sgemm::sgemm) 32 | target_compile_features(test PRIVATE cxx_std_17) 33 | -------------------------------------------------------------------------------- /apps/aarch64/sgemm/build.sh: -------------------------------------------------------------------------------- 1 
| #!/bin/bash 2 | 3 | ## Constants 4 | 5 | ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." >/dev/null 2>&1 && pwd)" 6 | 7 | ## Build dependencies 8 | 9 | # Ensure Exo is up to date 10 | if [ "$1" = "update" ]; then 11 | (cd "${ROOT_DIR}" && pip uninstall -y exo-lang && python -m build && 12 | pip install dist/*.whl) 13 | fi 14 | 15 | # set up cmake build 16 | cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_OSX_ARCHITECTURES=arm64 17 | 18 | # do the build 19 | cmake --build build --verbose 20 | 21 | # run a single case 22 | ./build/test 768 23 | -------------------------------------------------------------------------------- /apps/aarch64/sgemm/naive_sgemm.h: -------------------------------------------------------------------------------- 1 | #ifndef NAIVE_SGEMM_H 2 | #define NAIVE_SGEMM_H 3 | 4 | void naive_sgemm_square(const float *a, const float *b, float *c, long n); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /apps/gemmini/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22) 2 | 3 | # NOTICE: this project only has any hope of compiling with the RISC-V toolchain 4 | # provided by UCB-BAR. Thus, the included toolchain file is set by default, even 5 | # though this is typically a bad idea. 
6 | set(CMAKE_TOOLCHAIN_FILE "${CMAKE_CURRENT_LIST_DIR}/cmake/gemmini.cmake" 7 | CACHE FILEPATH "CMake toolchain file") 8 | 9 | project(exo-gemmini LANGUAGES C ASM) 10 | 11 | enable_testing() 12 | 13 | # Naughty 14 | add_compile_options( 15 | -Wno-pointer-to-int-cast 16 | -Wno-int-to-pointer-cast 17 | -Wno-incompatible-pointer-types 18 | -Wno-discarded-qualifiers 19 | ) 20 | 21 | # ---------------------------------------------------------------------------- # 22 | # Project-wide configuration 23 | 24 | if (PROJECT_IS_TOP_LEVEL) 25 | include(CTest) 26 | endif () 27 | 28 | # ---------------------------------------------------------------------------- # 29 | # Dependencies 30 | 31 | ## Exo 32 | find_package(Exo REQUIRED) 33 | 34 | ## gemmini-rocc-tests 35 | include(FetchContent) 36 | 37 | FetchContent_Declare( 38 | gemmini-rocc-tests 39 | GIT_REPOSITORY https://github.com/exo-lang/gemmini-rocc-tests.git 40 | GIT_TAG exo-v3 41 | ) 42 | 43 | FetchContent_MakeAvailable(gemmini-rocc-tests) 44 | 45 | # ---------------------------------------------------------------------------- # 46 | # Benchmarks 47 | 48 | add_subdirectory(src) 49 | add_subdirectory(test) 50 | -------------------------------------------------------------------------------- /apps/gemmini/cmake/gemmini.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.22) 2 | 3 | set(CMAKE_SYSTEM_NAME Linux) 4 | set(CMAKE_SYSTEM_PROCESSOR riscv64) 5 | 6 | set(RISCV "$ENV{RISCV}" CACHE PATH "Path to RISCV toolchain") 7 | 8 | set(CMAKE_SYSROOT "${RISCV}/sysroot") 9 | set(CMAKE_CROSSCOMPILING_EMULATOR "${RISCV}/bin/spike" --extension=gemmini) 10 | 11 | set(CMAKE_C_COMPILER "${RISCV}/bin/riscv64-unknown-elf-gcc") 12 | set(CMAKE_CXX_COMPILER "${RISCV}/bin/riscv64-unknown-elf-g++") 13 | 14 | add_compile_definitions(BAREMETAL=1) 15 | add_compile_options(-mcmodel=medany -fno-tree-loop-distribute-patterns -fno-builtin-printf -fno-common) 16 | 
#pragma once

#ifndef GEMM_ACC_MALLOC_H
#define GEMM_ACC_MALLOC_H

// uint32_t is used in the prototypes below, so the fixed-width integer
// header must be included here.
#include <stdint.h>

// Declarations for the accumulator allocator implemented in
// gemm_acc_malloc.c. Addresses are exchanged as 32-bit values.
void gemm_acc_init_mem(void);
uint32_t gemm_acc_malloc(long unsigned int size);
void gemm_acc_free(uint32_t addr);

#endif
-------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef GEMM_MALLOC_H 4 | #define GEMM_MALLOC_H 5 | 6 | #include 7 | 8 | void gemm_init_mem(void); 9 | uint32_t gemm_malloc(long unsigned int size); 10 | void gemm_free(uint32_t addr); 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /apps/gemmini/test/bare-metal/shims/include/gemmini.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_SHIMS_GEMMINI_H 4 | #define EXO_SHIMS_GEMMINI_H 5 | 6 | // Symbol overrides 7 | #define tiled_matmul_auto orig_tiled_matmul_auto 8 | #define tiled_conv_auto orig_tiled_conv_auto 9 | 10 | #include "@gemmini-rocc-tests_SOURCE_DIR@/include/gemmini.h" 11 | 12 | #undef tiled_matmul_auto 13 | #undef tiled_conv_auto 14 | 15 | void tiled_matmul_auto(size_t dim_I, size_t dim_J, size_t dim_K, 16 | const elem_t* A, const elem_t* B, const void * D, void * C, size_t stride_A, 17 | size_t stride_B, size_t stride_D, size_t stride_C, scale_t A_scale_factor, 18 | scale_t B_scale_factor, scale_acc_t D_scale_factor, int act, 19 | acc_scale_t scale, acc_scale_t bert_scale, bool repeating_bias, 20 | bool transpose_A, bool transpose_B, bool full_C, bool low_D, 21 | uint8_t weightA, enum tiled_matmul_type_t tiled_matmul_type); 22 | 23 | void tiled_conv_auto( int batch_size, int in_row_dim, int in_col_dim, 24 | int in_channels, int out_channels, int out_row_dim, int out_col_dim, int stride, 25 | int input_dilation, int kernel_dilation, int padding, int kernel_dim, 26 | bool wrot180, bool trans_output_1203, bool trans_input_3120, 27 | bool trans_weight_1203, bool trans_weight_0132, 28 | const elem_t * input, const elem_t * weights, const acc_t * bias, 29 | elem_t * output, 30 | int act, acc_scale_t scale, 31 | int pool_size, int pool_stride, int pool_padding, 32 | enum tiled_matmul_type_t tiled_conv_type); 33 | 34 | 35 | #endif 36 
| -------------------------------------------------------------------------------- /apps/gemmini/test/bare-metal/shims/include/gemmini_counter.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_SHIMS_GEMMINI_COUNTER_H 4 | #define EXO_SHIMS_GEMMINI_COUNTER_H 5 | 6 | #include "@gemmini-rocc-tests_SOURCE_DIR@/include/gemmini_counter.h" 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /apps/gemmini/test/bare-metal/shims/include/gemmini_params.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_SHIMS_GEMMINI_PARAMS_H 4 | #define EXO_SHIMS_GEMMINI_PARAMS_H 5 | 6 | #define HAS_MVIN_ACC_SCALE 7 | #define MVIN_SCALE_ACC(x, scale) \ 8 | ({float y = ROUND_NEAR_EVEN((x) * (scale)); y > INT32_MAX ? INT32_MAX : (y < INT32_MIN ? INT32_MIN : (acc_t)y);}) 9 | 10 | #include "@gemmini-rocc-tests_SOURCE_DIR@/include/gemmini_params.h" 11 | 12 | #endif 13 | -------------------------------------------------------------------------------- /apps/gemmini/test/bare-metal/shims/include/gemmini_testutils.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_SHIMS_GEMMINI_TESTUTILS_H 4 | #define EXO_SHIMS_GEMMINI_TESTUTILS_H 5 | 6 | #include "@gemmini-rocc-tests_SOURCE_DIR@/include/gemmini_testutils.h" 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /apps/gemmini/test/bare-metal/shims/rocc-software/src/xcustom.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_SHIMS_ROCC_SOFTWARE_XCUSTOM_H 4 | #define EXO_SHIMS_ROCC_SOFTWARE_XCUSTOM_H 5 | 6 | #include "@gemmini-rocc-tests_SOURCE_DIR@/rocc-software/src/xcustom.h" 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- 
/apps/gemmini/test/helpers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef EXO_APPS_GEMMINI_HELPERS_H 4 | #define EXO_APPS_GEMMINI_HELPERS_H 5 | 6 | #include 7 | 8 | void print_2i8(int N, int M, int8_t *data); 9 | void print_4i8(int N, int M, int K, int R, int8_t *data); 10 | void print_2i32(int N, int M, int32_t *data); 11 | bool check_eq_2i8(int N, int M, int8_t *lhs, int8_t *rhs); 12 | bool check_eq_4i8(int N, int M, int K, int R, int8_t *lhs, int8_t *rhs); 13 | bool check_eq_2i32(int N, int M, int32_t *lhs, int32_t *rhs); 14 | 15 | #endif // EXO_APPS_GEMMINI_HELPERS_H 16 | -------------------------------------------------------------------------------- /apps/x86/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | project(x86 LANGUAGES C CXX) 3 | 4 | # ---------------------------------------------------------------------------- # 5 | # Project-wide configuration 6 | 7 | if (PROJECT_IS_TOP_LEVEL) 8 | include(CTest) 9 | endif () 10 | 11 | 12 | # ---------------------------------------------------------------------------- # 13 | # Dependencies 14 | 15 | ## Exo 16 | find_package(Exo REQUIRED) 17 | 18 | ## oneAPI MKL 19 | set(MKL_ARCH "intel64" 20 | CACHE STRING "MKL architecture. Options: intel64, ia32") 21 | set(MKL_LINK "static" 22 | CACHE STRING "MKL link type. Options: dynamic, static, sdl") 23 | set(MKL_THREADING "sequential" 24 | CACHE STRING "MKL threading model. Options: sequential, intel_thread, gnu_thread, pgi_thread, tbb_thread") 25 | set(MKL_INTERFACE "lp64" 26 | CACHE STRING "MKL interface type. Options: lp64, ilp64") 27 | set(MKL_MPI "openmpi" 28 | CACHE STRING "Which MPI interface to use. Options: intelmpi, mpich, openmpi, msmpi, mshpc") 29 | 30 | find_package(MKL REQUIRED) 31 | 32 | ## oneAPI DNNL 33 | set(DNNL_CONFIGURATION "cpu_gomp" 34 | CACHE STRING "DNNL backend. 
Options: cpu_dpcpp_gpu_dpcpp, cpu_(gomp|iomp), cpu_tbb") 35 | 36 | find_package(dnnl REQUIRED) 37 | 38 | ## Google Benchmark 39 | find_package(benchmark REQUIRED) 40 | 41 | 42 | # ---------------------------------------------------------------------------- # 43 | # Benchmarks 44 | 45 | add_subdirectory(conv) 46 | add_subdirectory(resnet) 47 | add_subdirectory(sgemm) 48 | add_subdirectory(ssyrk) 49 | -------------------------------------------------------------------------------- /apps/x86/conv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- # 2 | # Conv implementations 3 | 4 | find_package(Halide REQUIRED) 5 | 6 | ## Exo 7 | add_exo_library( 8 | NAME conv 9 | SOURCES conv.py 10 | ) 11 | 12 | ## Halide 13 | add_executable(halide_conv.gen halide_conv_gen.cpp) 14 | target_link_libraries(halide_conv.gen PRIVATE Halide::Generator) 15 | 16 | add_halide_library(halide_conv_kernel FROM halide_conv.gen 17 | STMT_HTML _) 18 | 19 | ## Common object library 20 | add_library(conv_impls OBJECT exo_conv.cpp onednn_conv.cpp halide_conv.cpp) 21 | target_link_libraries( 22 | conv_impls 23 | PUBLIC DNNL::dnnl 24 | PRIVATE x86::conv halide_conv_kernel 25 | ) 26 | target_compile_features(conv_impls PUBLIC cxx_std_17) 27 | 28 | 29 | # ---------------------------------------------------------------------------- # 30 | # Test harness 31 | 32 | # Benchmarking 33 | add_executable(bench_conv bench_conv.cpp) 34 | target_link_libraries(bench_conv PRIVATE benchmark::benchmark_main conv_impls) 35 | target_compile_features(bench_conv PRIVATE cxx_std_17) 36 | 37 | # Correctness test 38 | add_executable(test_conv test_conv.cpp) 39 | target_link_libraries(test_conv PRIVATE conv_impls) 40 | target_compile_features(test_conv PRIVATE cxx_std_17) 41 | -------------------------------------------------------------------------------- /apps/x86/conv/Halide_Conv_Schedule.txt: 
-------------------------------------------------------------------------------- 1 | produce relu: 2 | parallel c.co in [0, 1]: 3 | parallel n in [0, 4]: 4 | parallel y in [0, 79]: 5 | for x.xo in [0, 19]: 6 | produce conv: 7 | unrolled x: 8 | unrolled c.c in [0, 3]: 9 | vectorized c.v4 in [0, 15]: 10 | conv(...) = ... 11 | for k in [0, 2]: 12 | for k in [0, 2]: 13 | for k.k in [0, 63]: // kc - input channels 14 | produce filter_im_global_wrapper: 15 | unrolled _3: // 2-unrolled kc 16 | unrolled _0._0 in [0, 3]: // oc - output channels (split) 17 | vectorized _0.v15 in [0, 15]: 18 | filter_im_global_wrapper(...) = ... 19 | consume filter_im_global_wrapper: 20 | unrolled k.r68 in [0, 1]: // 2-unrolled kc 21 | unrolled x: 22 | produce input_im_global_wrapper: 23 | input_im_global_wrapper(...) = ... 24 | consume input_im_global_wrapper: 25 | unrolled c.c in [0, 3]: 26 | vectorized c.v8 in [0, 15]: 27 | conv(...) = ... 28 | consume conv: 29 | unrolled x.xi in [0, 4]: 30 | unrolled c.ci.ci in [0, 3]: 31 | vectorized c.ci.v3 in [0, 15]: 32 | relu(...) = ... 
33 | -------------------------------------------------------------------------------- /apps/x86/conv/exo_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "exo_conv.hpp" 2 | 3 | #include 4 | 5 | #include "conv.h" 6 | 7 | void exo_conv(conv_instance &ci) { 8 | if (ci.OH != 80 || ci.OW != 100 || ci.OC != 128 || ci.KW != 3 || ci.N != 5) { 9 | abort(); 10 | } 11 | #if 0 12 | conv(nullptr, (int)ci.OH, (int)ci.OW, (int)ci.OC, (int)ci.IH, (int)ci.IW, 13 | (int)ci.IC, (int)ci.KW, (int)ci.N, ci.src_data.data(), 14 | ci.dst_data.data(), ci.weights_data.data(), ci.bias_data.data()); 15 | #else 16 | conv_specialized(nullptr, ci.src_data.data(), ci.dst_data.data(), 17 | ci.weights_data.data(), ci.bias_data.data()); 18 | #endif 19 | } 20 | -------------------------------------------------------------------------------- /apps/x86/conv/exo_conv.hpp: -------------------------------------------------------------------------------- 1 | #ifndef EXO_CONV_H 2 | #define EXO_CONV_H 3 | 4 | #include "conv_instance.hpp" 5 | 6 | void exo_conv(conv_instance &ci); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /apps/x86/conv/halide_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "halide_conv.hpp" 2 | 3 | #include 4 | #include 5 | 6 | using namespace Halide::Runtime; 7 | 8 | void halide_conv(conv_instance &ci) { 9 | Buffer input(ci.src_data.data(), ci.IC, ci.IW, ci.IH, ci.N); 10 | Buffer weights(ci.weights_data.data(), ci.OC, ci.KW, ci.KH, ci.IC); 11 | Buffer bias(ci.bias_data.data(), ci.OC); 12 | Buffer output(ci.dst_data.data(), ci.OC, ci.OW, ci.OH, ci.N); 13 | halide_conv_kernel(input, weights, bias, output); 14 | } 15 | -------------------------------------------------------------------------------- /apps/x86/conv/halide_conv.hpp: -------------------------------------------------------------------------------- 1 | #ifndef 
HALIDE_CONV_H 2 | #define HALIDE_CONV_H 3 | 4 | #include "conv_instance.hpp" 5 | 6 | void halide_conv(conv_instance &ci); 7 | 8 | #endif // HALIDE_CONV_H 9 | -------------------------------------------------------------------------------- /apps/x86/conv/onednn_conv.hpp: -------------------------------------------------------------------------------- 1 | #ifndef ONEDNN_CONV_H 2 | #define ONEDNN_CONV_H 3 | 4 | #include 5 | 6 | #include "conv_instance.hpp" 7 | 8 | class OneDNN_Conv { 9 | conv_instance &ci; 10 | 11 | dnnl::engine engine{dnnl::engine::kind::cpu, 0}; 12 | dnnl::stream engine_stream{engine}; 13 | dnnl::memory user_dst_mem; 14 | dnnl::memory conv_dst_mem; 15 | dnnl::convolution_forward::primitive_desc conv_pd; 16 | std::unordered_map conv_args; 17 | dnnl::convolution_forward conv_prim; 18 | 19 | public: 20 | OneDNN_Conv(conv_instance &ci); 21 | void run(); 22 | }; 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /apps/x86/halide/blur/.gitignore: -------------------------------------------------------------------------------- 1 | png_process 2 | __pycache__ 3 | blur.d 4 | -------------------------------------------------------------------------------- /apps/x86/halide/blur/README.md: -------------------------------------------------------------------------------- 1 | Run the following command to compile and run. 2 | ``` 3 | exocc -o blur --stem blur blur.py 4 | g++ -o png_process main.cpp blur/blur.c -lpng -mavx2 -fopenmp; ./png_process 5 | ``` 6 | 7 | The Halide schedule should be around 17x faster. 
Then run this to check output equivalence: 8 | ``` 9 | diff blur.png exo_blur_halide.png 10 | ``` -------------------------------------------------------------------------------- /apps/x86/halide/blur/gray.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/apps/x86/halide/blur/gray.png -------------------------------------------------------------------------------- /apps/x86/halide/blur/gray_scaled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/apps/x86/halide/blur/gray_scaled.png -------------------------------------------------------------------------------- /apps/x86/resnet/CMakeLists.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/apps/x86/resnet/CMakeLists.txt -------------------------------------------------------------------------------- /apps/x86/sgemm/alex_sgemm.h: -------------------------------------------------------------------------------- 1 | #ifndef ALEX_SGEMM_H 2 | #define ALEX_SGEMM_H 3 | 4 | void sgemm_square(const float *a, const float *b, float *c, long n); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /apps/x86/sgemm/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "alex_sgemm.h" 10 | 11 | static std::vector gen_matrix(long m, long n) { 12 | static std::random_device rd; 13 | static std::mt19937 rng{rd()}; 14 | std::uniform_real_distribution<> rv{-1.0f, 1.0f}; 15 | 16 | std::vector mat(m * n); 17 | std::generate(std::begin(mat), std::end(mat), [&]() { return rv(rng); }); 18 | 19 | 
return mat; 20 | } 21 | 22 | int main(int argc, char *argv[]) { 23 | if (argc != 2) { 24 | printf("Usage: %s \n", argv[0]); 25 | return 1; 26 | } 27 | int n = std::atoi(argv[1]); 28 | if (n < 1) { 29 | printf("n < 1!!\n"); 30 | return 1; 31 | } 32 | 33 | auto a = gen_matrix(n, n); 34 | auto b = gen_matrix(n, n); 35 | auto c = gen_matrix(n, n); 36 | auto c2 = c; 37 | 38 | sgemm_exo(nullptr, n, n, n, a.data(), b.data(), c.data()); 39 | sgemm_square(a.data(), b.data(), c2.data(), n); 40 | 41 | for (int i = 0; i < c2.size(); i++) { 42 | float expected = c2[i]; 43 | float actual = c[i]; 44 | double relerr = fabsf(actual - expected) / expected; 45 | if (relerr > 1e-3) { 46 | printf("index %d: %.6f != %.6f (expected)\n", i, actual, expected); 47 | } 48 | } 49 | 50 | printf("didn't crash, yay\n"); 51 | } 52 | -------------------------------------------------------------------------------- /apps/x86/ssyrk/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------------------------------------- # 2 | # Test harness 3 | 4 | add_executable(bench_ssyrk bench_ssyrk.cpp) 5 | target_link_libraries( 6 | bench_ssyrk 7 | PRIVATE 8 | MKL::MKL 9 | benchmark::benchmark_main 10 | ) 11 | target_compile_features(bench_ssyrk PRIVATE cxx_std_11) 12 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | black==24.10.0 2 | coverage==7.8.2 3 | pre-commit==4.2.0 4 | pytest-cov==6.1.1 5 | pytest-xdist==3.7.0 6 | pytest==8.3.5 7 | tox==4.26.0 8 | numpy==2.2.3 9 | Pillow==11.2.1 10 | -------------------------------------------------------------------------------- /docs/images/system-overview.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/docs/images/system-overview.png -------------------------------------------------------------------------------- /docs/primitives/config_ops.md: -------------------------------------------------------------------------------- 1 | 2 | ## Configuration modifying primitives 3 | 4 | #### `bind_config(proc, var_cursor, config, field)` 5 | Extracts a control-value expression and writes it into some designated field of a config. 6 | ``` 7 | args: 8 | var_cursor - cursor or pattern pointing at the expression to be bound 9 | config - config object to be written into 10 | field - (string) the field of `config` to be written to 11 | 12 | rewrite: 13 | Let s[ e ] mean a statement with control expression e occurring within it. Then, 14 | s[ e ] 15 | --> 16 | config.field = e 17 | s[ config.field ] 18 | ``` 19 | 20 | #### `delete_config(proc, stmt_cursor)` 21 | Deletes a statement that writes to some config.field. 22 | ``` 23 | args: 24 | stmt_cursor - cursor or pattern pointing at the statement to 25 | be deleted 26 | 27 | rewrite: 28 | s1 29 | config.field = _ 30 | s3 31 | --> 32 | s1 33 | s3 34 | ``` 35 | 36 | #### `write_config(proc, gap_cursor, config, field, rhs)` 37 | Inserts a statement that writes a desired value to some config.field.
38 | ``` 39 | args: 40 | gap_cursor - cursor pointing to where the new write statement should be inserted 41 | config - config object to be written into 42 | field - (string) the field of `config` to be written to 43 | rhs - (string) the expression to write into the field 44 | 45 | rewrite: 46 | s1 47 | s3 48 | --> 49 | s1 50 | config.field = new_expr 51 | s3 52 | ``` 53 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Scheduling Examples 2 | 3 | All the scheduling examples are hosted on [the compiler explorer](https://godbolt.exo-lang.dev/). 4 | 5 | This directory contains several examples, along with documentation and code. 6 | If you are new to Exo, we recommend going through the examples in the following order: 7 | 8 | 1. [AVX2 Matmul](./avx2_matmul/README.md): This example demonstrates how to take a simple matrix multiplication kernel and transform it into an implementation that can make use of AVX2 instructions. It provides an overview of Exo and its scheduling system. 9 | 10 | 2. [Cursor](./cursors/README.md): This example shows how to use Cursors to efficiently write schedules and define a new scheduling operator. 11 | 12 | 3. [RVM](./rvm_conv1d/README.md): This example illustrates how to use Exo to define and target a new hardware accelerator entirely in the user code. 13 | 14 | 4. Quizzes ([quiz1](./quiz1/README.md), [quiz2](./quiz2/README.md), [quiz3](./quiz3/README.md)) contain common scheduling mistakes in Exo and solutions to fix them. The best way to learn a programming language is by debugging code. 
15 | 16 | -------------------------------------------------------------------------------- /examples/avx2_matmul/Makefile: -------------------------------------------------------------------------------- 1 | CFLAGS ?= -march=native 2 | 3 | avx2_matmul: avx2_matmul.o main.o 4 | 5 | avx2_matmul.c: x86_matmul.py 6 | exocc -o . --stem $(*F) $^ 7 | 8 | main.c: avx2_matmul.c 9 | 10 | .PHONY: clean 11 | clean: 12 | $(RM) avx2_matmul avx2_matmul.* *.o exo_demo 13 | $(RM) -r __pycache__/ 14 | -------------------------------------------------------------------------------- /examples/avx2_matmul/main.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "avx2_matmul.h" 5 | 6 | #define K 2048 7 | static float A[6 * K]; 8 | static float B[K * 16]; 9 | static float C[6 * 16]; 10 | 11 | void initialize() { 12 | for (int i = 0; i < 6; i++) { 13 | for (int j = 0; j < K; j++) { 14 | A[i * K + j] = 3.2; 15 | } 16 | } 17 | for (int i = 0; i < K; i++) { 18 | for (int j = 0; j < 16; j++) { 19 | B[i * 16 + j] = 0.2; 20 | } 21 | } 22 | for (int i = 0; i < 6; i++) { 23 | for (int j = 0; j < 16; j++) { 24 | C[i * 16 + j] = 0.0; 25 | } 26 | } 27 | return; 28 | } 29 | 30 | int main() { 31 | clock_t start, end; 32 | int msec; 33 | 34 | // Calling original matmul 35 | start = clock(); 36 | for (int i = 0; i < 1000; i++) 37 | rank_k_reduce_6x16(NULL, K, A, B, C); 38 | end = clock(); 39 | 40 | msec = (end - start) * 1000 / CLOCKS_PER_SEC; 41 | printf("Time taken for original matmul: %d seconds %d milliseconds\n", 42 | msec / 1000, msec % 1000); 43 | 44 | // Calling scheduled matmul 45 | start = clock(); 46 | for (int i = 0; i < 1000; i++) 47 | rank_k_reduce_6x16_scheduled(NULL, K, A, B, C); 48 | end = clock(); 49 | 50 | msec = (end - start) * 1000 / CLOCKS_PER_SEC; 51 | printf("Time taken for scheduled matmul: %d seconds %d milliseconds\n", 52 | msec / 1000, msec % 1000); 53 | 54 | return (0); 55 | } 56 | 
-------------------------------------------------------------------------------- /examples/cursors/.gitignore: -------------------------------------------------------------------------------- 1 | cursors/ 2 | -------------------------------------------------------------------------------- /examples/cursors/README.md: -------------------------------------------------------------------------------- 1 | # Cursor Step-by-Step Tutorial 2 | 3 | This example demonstrates Cursors using the tile2D example (as shown in our [ASPLOS '25 paper](https://arxiv.org/abs/2411.07211)). 4 | 5 | ## Overview 6 | 7 | This example covers the key concepts presented in the paper: 8 | - Finding Cursors with pattern-matching 9 | - Cursor navigation 10 | - Applying scheduling primitives using cursors 11 | - Cursor forwarding after code transformations 12 | - Defining a new scheduling operation 13 | 14 | ## Getting Started 15 | 16 | To run this example: 17 | ```bash 18 | exocc cursors.py 19 | ``` 20 | Running `exocc` on `cursors.py` will generate the C code in the `cursors/cursors.c` file. 21 | It will also print out the intermediate steps of the example. 
from __future__ import annotations

from exo import *
from exo.stdlib.scheduling import *


# Element-wise c[i] = 2 * a[i] + 3 * b[i] over N elements, with N asserted
# to be a multiple of 8. (Documented with a comment rather than a docstring
# because the body is parsed by Exo's @proc front end.)
@proc
def scaled_add(N: size, a: f32[N], b: f32[N], c: f32[N]):
    assert N % 8 == 0
    for i in seq(0, N):
        c[i] = 2 * a[i] + 3 * b[i]


def stage_exprs(p, num_vectors, assign):
    """Recursively bind the operands of a binary right-hand side to ``vec``.

    If the right-hand side of ``assign`` is a ``BinaryOpCursor``, its left
    operand and then its right operand are each bound to a new variable
    named "vec" with ``bind_expr``. After each binding the function recurses
    on the statement that precedes the forwarded ``assign``.

    Args:
        p: procedure being scheduled.
        num_vectors: running count of ``vec`` bindings created so far.
        assign: cursor to the assignment whose right-hand side is staged.

    Returns:
        A ``(p, num_vectors)`` tuple with the rewritten procedure and the
        updated binding count.
    """
    if isinstance(assign.rhs(), BinaryOpCursor):
        p = bind_expr(p, assign.rhs().lhs(), "vec")
        num_vectors += 1
        # `assign` was obtained before the rewrite, so it is forwarded into
        # the new procedure before navigating to the previous statement.
        # NOTE(review): presumably .prev() is the binding just inserted -
        # confirm against bind_expr's documentation.
        p, num_vectors = stage_exprs(p, num_vectors, p.forward(assign).prev())

        p = bind_expr(p, assign.rhs().rhs(), "vec")
        num_vectors += 1
        p, num_vectors = stage_exprs(p, num_vectors, p.forward(assign).prev())
    return p, num_vectors


def wrong_schedule(p):
    """Apply the quiz's vectorisation schedule to ``p`` and return the result.

    Steps: rename the procedure to "scaled_add_scheduled", split loop ``i``
    by 8 into ``io``/``ii``, stage the sub-expressions of ``c[_] = _`` via
    ``stage_exprs``, then for every staged ``vec`` expand it by 8 along
    ``ii``, lift its allocation, and fission after its assignment.

    NOTE(review): the function name and the examples README (quizzes
    "contain common scheduling mistakes") indicate this schedule is left
    incorrect on purpose - do not "fix" it here.
    """
    p = rename(p, "scaled_add_scheduled")
    num_vectors = 0

    p = divide_loop(p, "i", 8, ["io", "ii"], perfect=True)

    p, num_vectors = stage_exprs(p, num_vectors, p.find("c[_] = _"))

    for i in range(num_vectors):
        # "#i" selects the i-th match of the pattern.
        vector_reg = p.find(f"vec: _ #{i}")
        p = expand_dim(p, vector_reg, 8, "ii")
        p = lift_alloc(p, vector_reg)

        vector_assign = p.find(f"vec = _ #{i}")
        p = fission(p, vector_assign.after())

    return p


# Runs at import time: schedule the kernel and print the resulting procedure.
w = wrong_schedule(scaled_add)
print(w)
-------------------------------------------------------------------------------- 1 | quiz3/ 2 | -------------------------------------------------------------------------------- /examples/rvm_conv1d/.gitignore: -------------------------------------------------------------------------------- 1 | out/ -------------------------------------------------------------------------------- /examples/rvm_conv1d/Makefile: -------------------------------------------------------------------------------- 1 | PROG = conv1d 2 | OUT = out/ 3 | CC = "${RISCV}/bin/clang" 4 | SPIKE = "${RISCV}/bin/spike" 5 | ASFLAGS = -march=rv32imc_xtheadmatrix0p1 -menable-experimental-extensions 6 | CFLAGS = -O2 -g3 $(ASFLAGS) 7 | 8 | default: sim 9 | exo_comp: exo/conv1d_exo.c 10 | 11 | $(OUT)/$(PROG).elf: $(OUT)/$(PROG).o $(OUT)/conv1d_exo.o 12 | $(CC) $(LDFLAGS) -o $@ $^ 13 | 14 | $(OUT)/$(PROG).o: main.c exo/conv1d_exo.h conv1Di32.h $(OUT) 15 | $(CC) $(CFLAGS) -o $@ -c $< 16 | 17 | $(OUT)/conv1d_exo.o: exo/conv1d_exo.c $(OUT) 18 | $(CC) $(CFLAGS) -o $@ -c $< 19 | 20 | $(OUT): 21 | @mkdir -p $(OUT) 22 | 23 | exo/conv1d_exo.h: exo/conv1d_exo.c 24 | exo/conv1d_exo.c: exo/conv1d.py 25 | exocc -o exo/ --stem conv1d_exo exo/conv1d.py 26 | 27 | conv1Di32.h: gen_stimuli.py 28 | python3 $< 29 | 30 | sim: $(OUT)/$(PROG).elf 31 | @$(SPIKE) --isa=RV32IMC_xmatrix pk -s $< -------------------------------------------------------------------------------- /examples/rvm_conv1d/exo/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | conv1d_exo.c 3 | conv1d_exo.h 4 | conv1d_exo.d 5 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.pytest.ini_options] 6 | filterwarnings = [ 7 | 'ignore:lib2to3 package is 
deprecated:PendingDeprecationWarning' 8 | ] 9 | norecursedirs = ['dependencies'] 10 | pythonpath = "src/" 11 | markers = [ 12 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 13 | ] 14 | 15 | [tool.black] 16 | exclude = '(demo_stage\.py|dependencies/.*|build/.*)' 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | PySMT==0.9.6 2 | asdl-adt==0.1.0 3 | asdl==0.1.5 4 | build==1.2.2.post1 5 | z3-solver==4.15.0.0 6 | yapf==0.43.0 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import setuptools 3 | 4 | if __name__ == "__main__": 5 | setuptools.setup() 6 | -------------------------------------------------------------------------------- /src/exo/__init__.py: -------------------------------------------------------------------------------- 1 | from .API import ( 2 | Procedure, 3 | compile_procs, 4 | compile_procs_to_strings, 5 | proc, 6 | instr, 7 | config, 8 | ExoType, 9 | ) 10 | from .rewrite.LoopIR_scheduling import SchedulingError 11 | from .frontend.parse_fragment import ParseFragmentError 12 | from .core.configs import Config 13 | from .core.memory import Memory, DRAM 14 | from .core.extern import Extern 15 | 16 | from . 
from ..core.LoopIR import LoopIR, LoopIR_Rewrite

from ..rewrite.new_eff import Check_ParallelizeLoop


class ParallelAnalysis(LoopIR_Rewrite):
    """Rewrite pass that validates the parallel loops of a procedure.

    Every ``LoopIR.For`` statement that reaches ``map_s`` with a
    ``LoopIR.Par`` loop mode is handed to ``Check_ParallelizeLoop``.
    Failures are collected rather than raised immediately, so ``run`` can
    report all offending loops in a single exception.
    """

    def __init__(self):
        # Accumulated "<srcinfo>: <message>" strings, reported together by run().
        self._errors = []

    def run(self, proc):
        """Analyse ``proc`` and return the result of ``apply_proc``.

        Args:
            proc: The ``LoopIR.proc`` to check.

        Returns:
            Whatever ``LoopIR_Rewrite.apply_proc`` returns for ``proc``.

        Raises:
            TypeError: If at least one parallel loop failed the check. The
                message lists every recorded error, one per line.
        """
        assert isinstance(proc, LoopIR.proc)
        # map_s needs the enclosing procedure to run the parallelization check.
        self.proc = proc
        proc = super().apply_proc(proc)
        if self._errors:
            errs = "\n".join(self._errors)
            # The header previously said "precision checking", which belongs
            # to a different pass and sent users looking in the wrong place.
            raise TypeError(f"Errors occurred during parallel analysis:\n{errs}")
        return proc

    def err(self, node, msg):
        """Record ``msg`` prefixed with the source location of ``node``."""
        self._errors.append(f"{node.srcinfo}: {msg}")

    def map_s(self, s):
        """Check ``s`` if it is a parallel ``for`` loop; record any failure.

        NOTE(review): this override returns ``None`` and does not delegate to
        ``super().map_s(s)``. Whether loops nested inside other statements
        are still visited depends on how ``LoopIR_Rewrite`` drives ``map_s``
        -- confirm against the base class.
        """
        if isinstance(s, LoopIR.For) and isinstance(s.loop_mode, LoopIR.Par):
            try:
                Check_ParallelizeLoop(self.proc, s)
            except Exception:
                # Any failure of the check is reported as a potential race.
                # Unlike a bare ``except:``, KeyboardInterrupt and SystemExit
                # still propagate.
                self.err(
                    s,
                    "parallel loop's body is not parallelizable because of potential data races",
                )
"SOURCES;PYTHONPATH") 3 | 4 | set(source_files "") 5 | 6 | foreach (src IN LISTS ARG_SOURCES) 7 | cmake_path(ABSOLUTE_PATH src 8 | BASE_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" 9 | NORMALIZE) 10 | list(APPEND source_files "${src}") 11 | endforeach () 12 | 13 | set(intdir "${ARG_NAME}.exo") 14 | set(files "${intdir}/${ARG_NAME}.c" "${intdir}/${ARG_NAME}.h") 15 | 16 | list(TRANSFORM ARG_PYTHONPATH PREPEND "--modify;PYTHONPATH=path_list_append:") 17 | 18 | add_custom_command( 19 | OUTPUT ${files} 20 | COMMAND ${CMAKE_COMMAND} -E env ${ARG_PYTHONPATH} -- 21 | $ -o "${intdir}" --stem "${ARG_NAME}" 22 | ${source_files} 23 | DEPENDS ${source_files} 24 | DEPFILE "${intdir}/${ARG_NAME}.d" 25 | VERBATIM 26 | ) 27 | 28 | list(TRANSFORM files PREPEND "${CMAKE_CURRENT_BINARY_DIR}/") 29 | add_library(${ARG_NAME} ${files}) 30 | add_library(${PROJECT_NAME}::${ARG_NAME} ALIAS ${ARG_NAME}) 31 | target_include_directories(${ARG_NAME} PUBLIC "$") 32 | endfunction() 33 | -------------------------------------------------------------------------------- /src/exo/cmake/ExoConfig.cmake: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.21) 2 | 3 | if (NOT CMAKE_FIND_PACKAGE_NAME STREQUAL "Exo") 4 | message(AUTHOR_WARNING "Found Exo using non-standard name '${CMAKE_FIND_PACKAGE_NAME}'") 5 | endif () 6 | 7 | include(CMakeFindDependencyMacro) 8 | find_dependency(Python 3.9) 9 | 10 | find_program( 11 | Exo_EXECUTABLE exocc 12 | HINTS 13 | "${Python_ROOT_DIR}/bin" 14 | "${Python_ROOT}/bin" 15 | ) 16 | mark_as_advanced(Exo_EXECUTABLE) 17 | 18 | if (NOT Exo_EXECUTABLE) 19 | set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "Could not find exocc!") 20 | set(${CMAKE_FIND_PACKAGE_NAME}_FOUND FALSE) 21 | return() 22 | endif () 23 | 24 | if (NOT TARGET Exo::compiler) 25 | add_executable(Exo::compiler IMPORTED) 26 | set_target_properties(Exo::compiler PROPERTIES IMPORTED_LOCATION "${Exo_EXECUTABLE}") 27 | endif () 28 | 29 | 
include("${CMAKE_CURRENT_LIST_DIR}/AddExoLibrary.cmake") 30 | 31 | foreach (comp IN LISTS Exo_FIND_COMPONENTS) 32 | if (NOT Exo_${comp}_FOUND AND Exo_FIND_REQUIRED_${comp}) 33 | set(Exo_FOUND FALSE) 34 | endif () 35 | endforeach () 36 | -------------------------------------------------------------------------------- /src/exo/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/src/exo/core/__init__.py -------------------------------------------------------------------------------- /src/exo/core/extern.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | # --------------------------------------------------------------------------- # 4 | # --------------------------------------------------------------------------- # 5 | # Extern superclass 6 | 7 | 8 | class Extern_Typecheck_Error(Exception): 9 | def __init__(self, msg): 10 | self._builtin_err_msg = str(msg) 11 | 12 | def __str__(self): 13 | return self._builtin_err_msg 14 | 15 | 16 | _EErr = Extern_Typecheck_Error 17 | 18 | 19 | class Extern: 20 | def __init__(self, name): 21 | self._name = name 22 | 23 | def name(self): 24 | return self._name 25 | 26 | def globl(self, prim_type): 27 | raise NotImplementedError() 28 | 29 | def typecheck(self, args): 30 | raise NotImplementedError() 31 | 32 | def interpret(self, args): 33 | raise NotImplementedError() 34 | 35 | def compile(self, args, prim_type): 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /src/exo/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/src/exo/frontend/__init__.py -------------------------------------------------------------------------------- 
/src/exo/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/src/exo/libs/__init__.py -------------------------------------------------------------------------------- /src/exo/libs/custom_malloc.h: -------------------------------------------------------------------------------- 1 | #ifndef CUSTOM_MALLOC_H 2 | #define CUSTOM_MALLOC_H 3 | void init_mem(void); 4 | void *malloc_dram(long unsigned int bytes); 5 | void free_dram(void *ptr); 6 | #endif 7 | -------------------------------------------------------------------------------- /src/exo/libs/exo_arm_sve.h: -------------------------------------------------------------------------------- 1 | #ifndef EXO_ARM_SVE_H 2 | #define EXO_ARM_SVE_H 3 | 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | static inline __attribute__((always_inline)) void svmla_n_f32_x_vla( 11 | int64_t N, float32_t *dst, const float32_t *src1, float32_t src2) { 12 | int64_t i = 0; 13 | svbool_t pg = svwhilelt_b32(i, N); 14 | do { 15 | svst1_f32(pg, &dst[i], 16 | svmla_n_f32_x( 17 | pg, svld1_f32(pg, &dst[i]), svld1_f32(pg, &src1[i]), src2)); 18 | i += svcntw(); 19 | pg = svwhilelt_b32(i, N); 20 | } while (svptest_first(svptrue_b32(), pg)); 21 | } 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #endif -------------------------------------------------------------------------------- /src/exo/libs/gemm_acc_malloc.h: -------------------------------------------------------------------------------- 1 | #ifndef GEMM_ACC_MALLOC_H 2 | #define GEMM_ACC_MALLOC_H 3 | void gemm_acc_init_mem(void); 4 | uint32_t gemm_acc_malloc(long unsigned int size); 5 | void gemm_acc_free(uint32_t addr); 6 | #endif 7 | -------------------------------------------------------------------------------- /src/exo/libs/gemm_malloc.h: -------------------------------------------------------------------------------- 
from __future__ import annotations

from exo import DRAM, instr


# Exo instruction definition for the C helper `svmla_n_f32_x_vla`.
# The first @instr argument is the C call template; `c_global` supplies the
# include for "exo_arm_sve.h", which provides that helper.
# The procedure body below states what the instruction computes:
# dst[i] += src1[i] * src2 for i in [0, N).
@instr(
    "svmla_n_f32_x_vla({N_data}, &{dst_data}, &{src1_data}, *{src2_data});",
    c_global='#include "exo_arm_sve.h"',
)
def svmla_n_f32_x_vla(
    N: size,
    dst: [f32][N] @ DRAM,
    src1: [f32][N] @ DRAM,
    src2: f32,
):
    # Both windows must be unit-stride in their only dimension; the C template
    # passes only their base addresses.
    assert stride(src1, 0) == 1
    assert stride(dst, 0) == 1

    # Scalar-times-vector multiply-accumulate into dst.
    for i in seq(0, N):
        dst[i] += src1[i] * src2
import scheduling 2 | 3 | __all__ = [ 4 | "scheduling", 5 | "stdlib", 6 | ] 7 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/tests/__init__.py -------------------------------------------------------------------------------- /tests/amx/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/tests/amx/__init__.py -------------------------------------------------------------------------------- /tests/golden/asplos25/test_higher_order/test_lrn.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, n): 4 | tmp_a: i8[n] @ DRAM # <-- NODE 5 | tmp_b: i8[n] @ DRAM 6 | tmp_a[i] = A[i] 7 | tmp_b[i] = A[i] 8 | def bar(n: size, A: i8[n] @ DRAM): 9 | for i in seq(0, n): 10 | for j in seq(0, n): 11 | tmp_a: i8[n] @ DRAM 12 | tmp_b: i8[n] @ DRAM # <-- NODE 13 | tmp_a[i] = A[i] 14 | tmp_b[i] = A[i] 15 | def bar(n: size, A: i8[n] @ DRAM): 16 | for i in seq(0, n): 17 | for j in seq(0, n): 18 | tmp_a: i8[n] @ DRAM 19 | tmp_b: i8[n] @ DRAM 20 | tmp_a[i] = A[i] # <-- NODE 21 | tmp_b[i] = A[i] 22 | def bar(n: size, A: i8[n] @ DRAM): 23 | for i in seq(0, n): 24 | for j in seq(0, n): 25 | tmp_a: i8[n] @ DRAM 26 | tmp_b: i8[n] @ DRAM 27 | tmp_a[i] = A[i] 28 | tmp_b[i] = A[i] # <-- NODE 29 | def bar(n: size, A: i8[n] @ DRAM): 30 | for i in seq(0, n): 31 | for j in seq(0, n): # <-- NODE 32 | tmp_a: i8[n] @ DRAM 33 | tmp_b: i8[n] @ DRAM 34 | tmp_a[i] = A[i] 35 | tmp_b[i] = A[i] 36 | def bar(n: size, A: i8[n] @ DRAM): 37 | for i in seq(0, n): # <-- NODE 38 | for j in seq(0, n): 39 | tmp_a: i8[n] @ DRAM 40 | tmp_b: i8[n] @ DRAM 41 | 
tmp_a[i] = A[i] 42 | tmp_b[i] = A[i] 43 | -------------------------------------------------------------------------------- /tests/golden/asplos25/test_higher_order/test_reduce.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | tmp_b: i8[n] @ DRAM 4 | tmp_a: i8[n] @ DRAM 5 | for j in seq(0, n): 6 | tmp_a[i] = A[i] 7 | tmp_b[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/asplos25/test_higher_order/test_reframe.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | x: R @ DRAM 3 | x = 0.0 4 | for i in seq(0, n): 5 | for j in seq(0, n): 6 | tmp_a: i8[n] @ DRAM 7 | tmp_a[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/asplos25/test_higher_order/test_repeat.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp_a: i8[n] @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, n): 5 | tmp_a[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_CIR_USub.txt: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #ifndef TEST_H 4 | #define TEST_H 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | 11 | #include 12 | #include 13 | 14 | // Compiler feature macros adapted from Hedley (public domain) 15 | // https://github.com/nemequ/hedley 16 | 17 | #if defined(__has_builtin) 18 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 19 | #else 20 | # define EXO_HAS_BUILTIN(builtin) (0) 21 | #endif 22 | 23 | #if EXO_HAS_BUILTIN(__builtin_assume) 24 | # define EXO_ASSUME(expr) __builtin_assume(expr) 25 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 26 | # define EXO_ASSUME(expr) \ 27 | 
((void)((expr) ? 1 : (__builtin_unreachable(), 1))) 28 | #else 29 | # define EXO_ASSUME(expr) ((void)(expr)) 30 | #endif 31 | 32 | 33 | 34 | // foo( 35 | // N : size, 36 | // x : f32[N] @DRAM 37 | // ) 38 | void foo( void *ctxt, int_fast32_t N, float* x ); 39 | 40 | 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | #endif // TEST_H 46 | 47 | #include "test.h" 48 | 49 | #include 50 | #include 51 | 52 | // foo( 53 | // N : size, 54 | // x : f32[N] @DRAM 55 | // ) 56 | void foo( void *ctxt, int_fast32_t N, float* x ) { 57 | for (int_fast32_t i = 0; i < N; i++) { 58 | x[-i + N - 1] = 0.0f; 59 | } 60 | } 61 | 62 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_f16.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | _Float16 a; 11 | a = a + ((_Float16) 3); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_f32.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | float a; 11 | a = a + 3.0f; 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_f64.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | double a; 11 | a = a + 3.0; 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_i32.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 
3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | int32_t a; 11 | a = a + ((int32_t) 3); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_i8.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | int8_t a; 11 | a = a + ((int8_t) 3); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_index.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | // foo( 4 | 5 | // ) 6 | void foo( void *ctxt ) { 7 | for (int_fast32_t x = 0; x < 6; x++) { 8 | ; // NO-OP 9 | } 10 | } 11 | 12 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_ui16.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | uint16_t a; 11 | a = a + ((uint16_t) 3); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_coercion_to_ui8.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | 8 | // ) 9 | void foo( void *ctxt ) { 10 | uint8_t a; 11 | a = a + ((uint8_t) 3); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_no_exo_floor_div_triangular_access.txt: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | #ifndef TEST_H 4 | #define TEST_H 5 | 6 | #ifdef 
__cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | 11 | #include 12 | #include 13 | 14 | // Compiler feature macros adapted from Hedley (public domain) 15 | // https://github.com/nemequ/hedley 16 | 17 | #if defined(__has_builtin) 18 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 19 | #else 20 | # define EXO_HAS_BUILTIN(builtin) (0) 21 | #endif 22 | 23 | #if EXO_HAS_BUILTIN(__builtin_assume) 24 | # define EXO_ASSUME(expr) __builtin_assume(expr) 25 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 26 | # define EXO_ASSUME(expr) \ 27 | ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) 28 | #else 29 | # define EXO_ASSUME(expr) ((void)(expr)) 30 | #endif 31 | 32 | 33 | 34 | // foo( 35 | // N : size, 36 | // x : f32[N, N] @DRAM 37 | // ) 38 | void foo( void *ctxt, int_fast32_t N, float* x ); 39 | 40 | 41 | 42 | #ifdef __cplusplus 43 | } 44 | #endif 45 | #endif // TEST_H 46 | 47 | #include "test.h" 48 | 49 | #include 50 | #include 51 | 52 | // foo( 53 | // N : size, 54 | // x : f32[N, N] @DRAM 55 | // ) 56 | void foo( void *ctxt, int_fast32_t N, float* x ) { 57 | for (int_fast32_t ii = 0; ii < N % 4; ii++) { 58 | for (int_fast32_t joo = 0; joo < ((ii + ((N) / (4)) * 4) / (16)); joo++) { 59 | x[ii * N + joo] = 0.0f; 60 | } 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /tests/golden/test_codegen/test_pragma_parallel_loop.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | // foo( 10 | // x : i8[10] @DRAM 11 | // ) 12 | void foo( void *ctxt, int8_t* x ) { 13 | #pragma omp parallel for 14 | for (int_fast32_t i = 0; i < 10; i++) { 15 | int8_t y[10]; 16 | x[i] = y[i]; 17 | } 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tests/golden/test_config/test_basic_config.txt: 
-------------------------------------------------------------------------------- 1 | def foo(x: f32 @ DRAM): 2 | ConfigAB.a = 32.0 3 | x = ConfigAB.a -------------------------------------------------------------------------------- /tests/golden/test_config/test_config_bind.txt: -------------------------------------------------------------------------------- 1 | def foo(scale: f32 @ DRAM): 2 | for i in seq(0, 10): 3 | tmp: f32 @ DRAM 4 | tmp = 0.0 5 | ConfigLoad.scale = scale 6 | tmp = tmp * ConfigLoad.scale -------------------------------------------------------------------------------- /tests/golden/test_config/test_config_fission.txt: -------------------------------------------------------------------------------- 1 | def foo(scale: f32 @ DRAM, n: size, m: size, A: f32[n, m] @ DRAM): 2 | ConfigLoad.scale = scale 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | tmp: f32 @ DRAM 6 | tmp = A[i, j] 7 | tmp = tmp * ConfigLoad.scale -------------------------------------------------------------------------------- /tests/golden/test_config/test_config_write.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | tmp: f32 @ DRAM 3 | tmp = 0.0 4 | Config.tmp = tmp -------------------------------------------------------------------------------- /tests/golden/test_config/test_ld.txt: -------------------------------------------------------------------------------- 1 | def config_ld_i8(scale: f32 @ DRAM, src_stride: stride @ DRAM): 2 | # @instr gemmini_extended3_config_ld({src_stride}, {scale}[0], 0, 0); 3 | # 4 | ConfigLoad.scale = scale 5 | ConfigLoad.src_stride = src_stride 6 | def ld_i8(n: size, m: size, scale: f32 @ DRAM, src: i8[n, m] @ DRAM, 7 | dst: i8[n, 16] @ GEMM_SCRATCH): 8 | assert n <= 16 9 | assert m <= 16 10 | assert stride(src, 1) == 1 11 | assert stride(dst, 0) == 16 12 | assert stride(dst, 1) == 1 13 | config_ld_i8(scale, stride(src, 0)) 14 | do_ld_i8(n + 0, m + 0, src, dst) 
-------------------------------------------------------------------------------- /tests/golden/test_config/test_loop_complex_guards.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | for i in seq(0, n): 3 | if ConfigControl.i == 3: 4 | ConfigControl.i = 4 5 | if n == n - 1: 6 | ConfigControl.i = 3 -------------------------------------------------------------------------------- /tests/golden/test_config/test_stride_with_config.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, src: [i8][n] @ DRAM): 2 | assert stride(src, 0) == ConfigLoad.src_stride 3 | pass 4 | def foo(n: size, src: [i8][n] @ DRAM): 5 | assert stride(src, 0) == ConfigLoad.src_stride 6 | bar(n, src) -------------------------------------------------------------------------------- /tests/golden/test_config/test_write_all_control.txt: -------------------------------------------------------------------------------- 1 | def set_all(i: index, s: stride @ DRAM, b: bool @ DRAM): 2 | ConfigControl.i = i 3 | ConfigControl.s = s 4 | ConfigControl.b = b -------------------------------------------------------------------------------- /tests/golden/test_config/test_write_loop_builtin.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | for i in seq(0, n): 3 | ConfigAB.a = sin(1.0) -------------------------------------------------------------------------------- /tests/golden/test_config/test_write_loop_const_number.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | for i in seq(0, n): 3 | ConfigAB.a = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_arg_cursor.txt: -------------------------------------------------------------------------------- 1 | n, False 2 | alpha, False 3 | x, True, n, n 4 | 
-------------------------------------------------------------------------------- /tests/golden/test_cursors/test_basic_forwarding.txt: -------------------------------------------------------------------------------- 1 | def p(): 2 | x: f32 @ DRAM 3 | pass 4 | if True: 5 | x = 1.0 6 | x = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_basic_forwarding2.txt: -------------------------------------------------------------------------------- 1 | def filter1D(ow: size, kw: size, x: f32[ow + kw - 1] @ DRAM, y: f32[ow] @ DRAM, 2 | w: f32[kw] @ DRAM): 3 | for outXo in seq(0, ow / 4): 4 | sum: f32[4] @ DRAM # <-- NODE 5 | for outXi in seq(0, 4): 6 | sum[outXi] = 0.0 7 | for k in seq(0, kw): 8 | sum[outXi] += x[4 * outXo + outXi + k] * w[k] 9 | y[4 * outXo + outXi] = sum[outXi] 10 | if ow % 4 > 0: 11 | for outXi in seq(0, ow % 4): 12 | sum: f32 @ DRAM 13 | sum = 0.0 14 | for k in seq(0, kw): 15 | sum += x[outXi + ow / 4 * 4 + k] * w[k] 16 | y[outXi + ow / 4 * 4] = sum -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_basic_forwarding3.txt: -------------------------------------------------------------------------------- 1 | def filter1D(ow: size, kw: size, x: f32[ow + kw - 1] @ DRAM, y: f32[ow] @ DRAM, 2 | w: f32[kw] @ DRAM): 3 | for outXo in seq(0, ow / 4): 4 | sum: f32[4] @ DRAM # <-- NODE 5 | for outXi in seq(0, 4): 6 | sum[outXi] = 0.0 7 | for k in seq(0, kw): 8 | sum[outXi] += x[4 * outXo + outXi + k] * w[k] 9 | y[4 * outXo + outXi] = sum[outXi] 10 | if ow % 4 > 0: 11 | for outXi in seq(0, ow % 4): 12 | sum: f32 @ DRAM 13 | sum = 0.0 14 | for k in seq(0, kw): 15 | sum += x[outXi + ow / 4 * 4 + k] * w[k] 16 | y[outXi + ow / 4 * 4] = sum -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_bind_expr_forwarding.txt: 
-------------------------------------------------------------------------------- 1 | def scal(n: size, alpha: R @ DRAM, x: [R][n] @ DRAM): 2 | for io in seq(0, n / 8): 3 | for ii in seq(0, 8): 4 | alphaReg: R @ DRAM 5 | alphaReg = alpha 6 | x[8 * io + ii] = alphaReg * x[8 * io + ii] 7 | for ii in seq(0, n % 8): 8 | x[ii + n / 8 * 8] = alpha * x[ii + n / 8 * 8] -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_gap_forwarding.txt: -------------------------------------------------------------------------------- 1 | def p(): 2 | x: f32 @ DRAM 3 | pass 4 | if True: 5 | x = 1.0 6 | pass 7 | x = 2.0 8 | pass -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_get_enclosing_loop_by_name.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8 @ DRAM): 2 | for i in seq(0, 5): 3 | for j in seq(0, 5): # <-- NODE 4 | if i == 0: 5 | x = 1.0 6 | 7 | def foo(x: i8 @ DRAM): 8 | for i in seq(0, 5): # <-- NODE 9 | for j in seq(0, 5): 10 | if i == 0: 11 | x = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_get_stmt_within_scope.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8 @ DRAM): 2 | for i in seq(0, 8): 3 | if i + 3 < -1: # <-- NODE 4 | ... 
-------------------------------------------------------------------------------- /tests/golden/test_cursors/test_match_parent.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8 @ DRAM): 2 | for i in seq(0, 8): 3 | if i + 3 < -1: 4 | x = 0.0 5 | pass 6 | for i in seq(0, 2): # <-- NODE 7 | x = 1.0 8 | 9 | def foo(x: i8 @ DRAM): 10 | for i in seq(0, 8): # <-- NODE 11 | if i + 3 < -1: 12 | x = 0.0 13 | pass 14 | for i in seq(0, 2): 15 | x = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_match_parent_2.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8 @ DRAM): 2 | for i in seq(0, 8): # <-- NODE 3 | x = 1.0 4 | for j in seq(0, 2): 5 | x = 2.0 6 | 7 | def foo(x: i8 @ DRAM): 8 | for i in seq(0, 8): 9 | x = 1.0 10 | for j in seq(0, 2): # <-- NODE 11 | x = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_reorder_loops_forwarding.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | for j in seq(0, 4): 3 | for k in seq(0, 4): 4 | for i in seq(0, 4): 5 | x: i8 @ DRAM -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_simplify_forwarding.txt: -------------------------------------------------------------------------------- 1 | y[1] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_unroll_buffer_forwarding.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | src_0: i32 @ DRAM 3 | src_1: i32 @ DRAM 4 | src_0 = 1.0 # <-- NODE 5 | src_1 = 1.0 6 | def foo(): 7 | src_0: i32 @ DRAM 8 | src_1: i32 @ DRAM 9 | src_0 = 1.0 10 | src_1 = 1.0 # <-- NODE 11 | def foo(): 12 | src_0: i32 @ DRAM 13 | src_1: i32 @ DRAM 14 | 
src_0 = 1.0 # <-- NODE 15 | src_1 = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_cursors/test_vectorize_forwarding.txt: -------------------------------------------------------------------------------- 1 | def scal(n: size, alpha: R @ DRAM, x: [R][n] @ DRAM): 2 | for io in seq(0, n / 8): 3 | alphaReg: R[8] @ DRAM 4 | for ii in seq(0, 8): 5 | alphaReg[ii] = alpha 6 | for ii in seq(0, 8): 7 | x[8 * io + ii] = alphaReg[ii] * x[8 * io + ii] # <-- NODE 8 | for ii in seq(0, n % 8): 9 | x[ii + n / 8 * 8] = alpha * x[ii + n / 8 * 8] -------------------------------------------------------------------------------- /tests/golden/test_externs/test_expf.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | // foo( 8 | // x : i8[16] @DRAM, 9 | // y : i8[16] @DRAM 10 | // ) 11 | void foo( void *ctxt, const int8_t* x, int8_t* y ) { 12 | for (int_fast32_t i = 0; i < 16; i++) { 13 | y[i] = expf((int8_t)(x[i] + y[i])); 14 | } 15 | } 16 | 17 | 18 | #pragma once 19 | #ifndef TEST_H 20 | #define TEST_H 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | 27 | #include 28 | #include 29 | 30 | // Compiler feature macros adapted from Hedley (public domain) 31 | // https://github.com/nemequ/hedley 32 | 33 | #if defined(__has_builtin) 34 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 35 | #else 36 | # define EXO_HAS_BUILTIN(builtin) (0) 37 | #endif 38 | 39 | #if EXO_HAS_BUILTIN(__builtin_assume) 40 | # define EXO_ASSUME(expr) __builtin_assume(expr) 41 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 42 | # define EXO_ASSUME(expr) \ 43 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 44 | #else 45 | # define EXO_ASSUME(expr) ((void)(expr)) 46 | #endif 47 | 48 | 49 | 50 | // foo( 51 | // x : i8[16] @DRAM, 52 | // y : i8[16] @DRAM 53 | // ) 54 | void foo( void *ctxt, const int8_t* x, int8_t* y ); 55 | 56 | 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | #endif // TEST_H 62 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_extern_find.txt: -------------------------------------------------------------------------------- 1 | def foo(a: f32 @ DRAM): 2 | a = sin(a) # <-- NODE -------------------------------------------------------------------------------- /tests/golden/test_externs/test_fmaxf.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | // foo( 8 | // x : f32[16] @DRAM, 9 | // y : f32[16] @DRAM 10 | // ) 11 | void foo( void *ctxt, const float* x, float* y ) { 12 | for (int_fast32_t i = 0; i < 16; i++) { 13 | y[i] = fmaxf((float)(x[i]), (float)(y[i] * 2.0f)); 14 | } 15 | } 16 | 17 | 18 | #pragma once 19 | #ifndef TEST_H 20 | #define TEST_H 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | 27 | #include 28 | #include 29 | 30 | // Compiler feature macros adapted from Hedley (public domain) 31 | // https://github.com/nemequ/hedley 32 | 33 | #if defined(__has_builtin) 34 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 35 | #else 36 | # define EXO_HAS_BUILTIN(builtin) (0) 37 | #endif 38 | 39 | #if EXO_HAS_BUILTIN(__builtin_assume) 40 | # define EXO_ASSUME(expr) __builtin_assume(expr) 41 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 42 | # define EXO_ASSUME(expr) \ 43 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 44 | #else 45 | # define EXO_ASSUME(expr) ((void)(expr)) 46 | #endif 47 | 48 | 49 | 50 | // foo( 51 | // x : f32[16] @DRAM, 52 | // y : f32[16] @DRAM 53 | // ) 54 | void foo( void *ctxt, const float* x, float* y ); 55 | 56 | 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | #endif // TEST_H 62 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_relu.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | float _relu_float(float x) { 7 | if (x > 0.0) return x; 8 | else return 0.0; 9 | } 10 | 11 | // foo( 12 | // x : f32[16] @DRAM 13 | // ) 14 | void foo( void *ctxt, float* x ) { 15 | for (int_fast32_t i = 0; i < 16; i++) { 16 | x[i] = _relu_float((float)3.0f); 17 | } 18 | } 19 | 20 | 21 | #pragma once 22 | #ifndef TEST_H 23 | #define TEST_H 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | 30 | #include 31 | #include 32 | 33 | // Compiler feature macros adapted from Hedley (public domain) 34 | // https://github.com/nemequ/hedley 35 | 36 | #if defined(__has_builtin) 37 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 38 | #else 39 | # define EXO_HAS_BUILTIN(builtin) (0) 40 | #endif 41 | 42 | #if EXO_HAS_BUILTIN(__builtin_assume) 43 | # define EXO_ASSUME(expr) __builtin_assume(expr) 44 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 45 | # define EXO_ASSUME(expr) \ 46 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 47 | #else 48 | # define EXO_ASSUME(expr) ((void)(expr)) 49 | #endif 50 | 51 | 52 | 53 | // foo( 54 | // x : f32[16] @DRAM 55 | // ) 56 | void foo( void *ctxt, float* x ); 57 | 58 | 59 | 60 | #ifdef __cplusplus 61 | } 62 | #endif 63 | #endif // TEST_H 64 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_relu2.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | float _relu_float(float x) { 7 | if (x > 0.0) return x; 8 | else return 0.0; 9 | } 10 | 11 | // foo( 12 | // x : f32[16] @DRAM 13 | // ) 14 | void foo( void *ctxt, float* x ) { 15 | for (int_fast32_t i = 0; i < 16; i++) { 16 | x[i] = _relu_float((float)x[i]); 17 | } 18 | } 19 | 20 | 21 | #pragma once 22 | #ifndef TEST_H 23 | #define TEST_H 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | 30 | #include 31 | #include 32 | 33 | // Compiler feature macros adapted from Hedley (public domain) 34 | // https://github.com/nemequ/hedley 35 | 36 | #if defined(__has_builtin) 37 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 38 | #else 39 | # define EXO_HAS_BUILTIN(builtin) (0) 40 | #endif 41 | 42 | #if EXO_HAS_BUILTIN(__builtin_assume) 43 | # define EXO_ASSUME(expr) __builtin_assume(expr) 44 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 45 | # define EXO_ASSUME(expr) \ 46 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 47 | #else 48 | # define EXO_ASSUME(expr) ((void)(expr)) 49 | #endif 50 | 51 | 52 | 53 | // foo( 54 | // x : f32[16] @DRAM 55 | // ) 56 | void foo( void *ctxt, float* x ); 57 | 58 | 59 | 60 | #ifdef __cplusplus 61 | } 62 | #endif 63 | #endif // TEST_H 64 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_relu4.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | int8_t _relu_int8_t(int8_t x) { 7 | if (x > 0.0) return x; 8 | else return 0.0; 9 | } 10 | 11 | // foo( 12 | // x : i8[16] @DRAM 13 | // ) 14 | void foo( void *ctxt, int8_t* x ) { 15 | for (int_fast32_t i = 0; i < 16; i++) { 16 | x[i] = _relu_int8_t((int8_t)((int8_t) 3.0)); 17 | } 18 | } 19 | 20 | 21 | #pragma once 22 | #ifndef TEST_H 23 | #define TEST_H 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | 30 | #include 31 | #include 32 | 33 | // Compiler feature macros adapted from Hedley (public domain) 34 | // https://github.com/nemequ/hedley 35 | 36 | #if defined(__has_builtin) 37 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 38 | #else 39 | # define EXO_HAS_BUILTIN(builtin) (0) 40 | #endif 41 | 42 | #if EXO_HAS_BUILTIN(__builtin_assume) 43 | # define EXO_ASSUME(expr) __builtin_assume(expr) 44 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 45 | # define EXO_ASSUME(expr) \ 46 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 47 | #else 48 | # define EXO_ASSUME(expr) ((void)(expr)) 49 | #endif 50 | 51 | 52 | 53 | // foo( 54 | // x : i8[16] @DRAM 55 | // ) 56 | void foo( void *ctxt, int8_t* x ); 57 | 58 | 59 | 60 | #ifdef __cplusplus 61 | } 62 | #endif 63 | #endif // TEST_H 64 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_sigmoid.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | 7 | #include 8 | float sigmoid(float x) { 9 | return 1 / (1 + exp(-x)); 10 | } 11 | 12 | // foo( 13 | // x : f32[16] @DRAM, 14 | // y : f32[16] @DRAM 15 | // ) 16 | void foo( void *ctxt, const float* x, float* y ) { 17 | for (int_fast32_t i = 0; i < 16; i++) { 18 | y[i] = sigmoid((float)(x[i] + y[i])); 19 | } 20 | } 21 | 22 | 23 | #pragma once 24 | #ifndef TEST_H 25 | #define TEST_H 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | 32 | #include 33 | #include 34 | 35 | // Compiler feature macros adapted from Hedley (public domain) 36 | // https://github.com/nemequ/hedley 37 | 38 | #if defined(__has_builtin) 39 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 40 | #else 41 | # define EXO_HAS_BUILTIN(builtin) (0) 42 | #endif 43 | 44 | #if EXO_HAS_BUILTIN(__builtin_assume) 45 | # define EXO_ASSUME(expr) __builtin_assume(expr) 46 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 47 | # define EXO_ASSUME(expr) \ 48 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 49 | #else 50 | # define EXO_ASSUME(expr) ((void)(expr)) 51 | #endif 52 | 53 | 54 | 55 | // foo( 56 | // x : f32[16] @DRAM, 57 | // y : f32[16] @DRAM 58 | // ) 59 | void foo( void *ctxt, const float* x, float* y ); 60 | 61 | 62 | 63 | #ifdef __cplusplus 64 | } 65 | #endif 66 | #endif // TEST_H 67 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_sin.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | // foo( 8 | // x : i8[16] @DRAM 9 | // ) 10 | void foo( void *ctxt, int8_t* x ) { 11 | for (int_fast32_t i = 0; i < 16; i++) { 12 | x[i] = sin((int8_t)x[i] * ((int8_t) 2)); 13 | } 14 | } 15 | 16 | 17 | #pragma once 18 | #ifndef TEST_H 19 | #define TEST_H 20 | 21 | #ifdef __cplusplus 22 | extern "C" { 23 | #endif 24 | 25 | 26 | #include 27 | #include 28 | 29 | // Compiler feature macros adapted from Hedley (public domain) 30 | // https://github.com/nemequ/hedley 31 | 32 | #if defined(__has_builtin) 33 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 34 | #else 35 | # define EXO_HAS_BUILTIN(builtin) (0) 36 | #endif 37 | 38 | #if EXO_HAS_BUILTIN(__builtin_assume) 39 | # define EXO_ASSUME(expr) __builtin_assume(expr) 40 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 41 | # define EXO_ASSUME(expr) \ 42 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 43 | #else 44 | # define EXO_ASSUME(expr) ((void)(expr)) 45 | #endif 46 | 47 | 48 | 49 | // foo( 50 | // x : i8[16] @DRAM 51 | // ) 52 | void foo( void *ctxt, int8_t* x ); 53 | 54 | 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | #endif // TEST_H 60 | -------------------------------------------------------------------------------- /tests/golden/test_externs/test_sqrt.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | // foo( 8 | // x : f32[16] @DRAM, 9 | // y : f32[16] @DRAM 10 | // ) 11 | void foo( void *ctxt, const float* x, float* y ) { 12 | for (int_fast32_t i = 0; i < 16; i++) { 13 | y[i] = sqrt((float)(x[i] + y[i])); 14 | } 15 | } 16 | 17 | 18 | #pragma once 19 | #ifndef TEST_H 20 | #define TEST_H 21 | 22 | #ifdef __cplusplus 23 | extern "C" { 24 | #endif 25 | 26 | 27 | #include 28 | #include 29 | 30 | // Compiler feature macros adapted from Hedley (public domain) 31 | // https://github.com/nemequ/hedley 32 | 33 | #if defined(__has_builtin) 34 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 35 | #else 36 | # define EXO_HAS_BUILTIN(builtin) (0) 37 | #endif 38 | 39 | #if EXO_HAS_BUILTIN(__builtin_assume) 40 | # define EXO_ASSUME(expr) __builtin_assume(expr) 41 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 42 | # define EXO_ASSUME(expr) \ 43 | ((void)((expr) ? 
1 : (__builtin_unreachable(), 1))) 44 | #else 45 | # define EXO_ASSUME(expr) ((void)(expr)) 46 | #endif 47 | 48 | 49 | 50 | // foo( 51 | // x : f32[16] @DRAM, 52 | // y : f32[16] @DRAM 53 | // ) 54 | void foo( void *ctxt, const float* x, float* y ); 55 | 56 | 57 | 58 | #ifdef __cplusplus 59 | } 60 | #endif 61 | #endif // TEST_H 62 | -------------------------------------------------------------------------------- /tests/golden/test_halide_ops/test_compute_at_with_prologue.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | producer: i8[11, 11] @ DRAM 3 | consumer: i8[10, 10] @ DRAM 4 | for y in seq(0, 1): 5 | for x in seq(0, 11): 6 | producer[y, x] = 1.0 7 | for y in seq(0, 10): 8 | for x in seq(0, 1): 9 | producer[1 + y, x] = 1.0 10 | for x in seq(0, 10): 11 | producer[1 + y, 1 + x] = 1.0 12 | consumer[y, x] = producer[y, x] + producer[y, 1 + x] + producer[ 13 | 1 + y, x] + producer[1 + y, 1 + x] -------------------------------------------------------------------------------- /tests/golden/test_halide_ops/test_schedule_blur1d.txt: -------------------------------------------------------------------------------- 1 | def blur1d_compute_at_store_root(n: size, consumer: i8[n] @ DRAM, 2 | inp: i8[n + 6] @ DRAM): 3 | producer: i8[1 + n] @ DRAM 4 | for i in seq(0, n): 5 | for ii in seq(0, 2): 6 | producer[i + ii] = (inp[i + ii] + inp[1 + i + ii] + 7 | inp[2 + i + ii] + inp[3 + i + ii] + 8 | inp[4 + i + ii] + inp[5 + i + ii]) / 6.0 9 | consumer[i] = (producer[i] + producer[1 + i]) / 2.0 10 | 11 | def blur1d_compute_at(n: size, consumer: i8[n] @ DRAM, inp: i8[n + 6] @ DRAM): 12 | for i in seq(0, n): 13 | producer: i8[2] @ DRAM 14 | for ii in seq(0, 2): 15 | producer[ii] = (inp[i + ii] + inp[1 + i + ii] + inp[2 + i + ii] + 16 | inp[3 + i + ii] + inp[4 + i + ii] + 17 | inp[5 + i + ii]) / 6.0 18 | consumer[i] = (producer[0] + producer[1]) / 2.0 19 | 20 | def blur1d_inline(n: size, consumer: i8[n] @ DRAM, inp: i8[n + 6] @ 
DRAM): 21 | for i in seq(0, n): 22 | consumer[i] = ((inp[i] + inp[1 + i] + inp[2 + i] + inp[3 + i] + 23 | inp[4 + i] + inp[5 + i]) / 6.0 + 24 | (inp[1 + i] + inp[2 + i] + inp[3 + i] + inp[4 + i] + 25 | inp[5 + i] + inp[6 + i]) / 6.0) / 2.0 -------------------------------------------------------------------------------- /tests/golden/test_im2col/test_im2col.txt: -------------------------------------------------------------------------------- 1 | def im2col(C: size, W: size, R: size, x: R[C, W] @ DRAM, 2 | y: R[C + 1, R + 1, W + 1] @ DRAM): 3 | for c in seq(0, C): 4 | for r in seq(0, R): 5 | for i in seq(0, W): 6 | if 0 <= i - r: 7 | y[c, r, i] = x[c, i - r] 8 | def matmul(K: size, C: size, W: size, R: size, w: R[K, C, R] @ DRAM, 9 | res: R[K, W] @ DRAM, y: R[C + 1, R + 1, W + 1] @ DRAM): 10 | for k in seq(0, K): 11 | for c in seq(0, C): 12 | for r in seq(0, R): 13 | for i in seq(0, W): 14 | if 0 <= i - r: 15 | res[k, i] += w[k, c, r] * y[c, r, i] 16 | def im2col_conv(K: size, C: size, W: size, R: size, w: R[K, C, R] @ DRAM, 17 | x: R[C, W] @ DRAM, res: R[K, W] @ DRAM): 18 | for k_init in seq(0, K): 19 | for i_init in seq(0, W): 20 | res[k_init, i_init] = 0.0 21 | y: R[C + 1, R + 1, W + 1] @ DRAM 22 | im2col(C, W, R, x, y) 23 | tiled_matmul(K, C, W, R, w, res, y) -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_block_delete.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): 2 | x: f32 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | x = 0.0 6 | x = 4.0 7 | x = 5.0 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_block_delete_whole_block.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): 2 | x: f32 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | pass 
-------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_block_replace.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): 2 | x: f32 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | x = 0.0 6 | pass 7 | x = 4.0 8 | x = 5.0 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_cursor_pretty_print_nodes.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): # <-- NODE 2 | x: f32 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | x = 0.0 6 | x = 1.0 7 | x = 2.0 8 | x = 3.0 9 | x = 4.0 10 | x = 5.0 11 | 12 | def bar(n: size, m: size): 13 | x: f32 @ DRAM 14 | for i in seq(0, n): # <-- NODE 15 | for j in seq(0, m): 16 | x = 0.0 17 | x = 1.0 18 | x = 2.0 19 | x = 3.0 20 | x = 4.0 21 | x = 5.0 22 | 23 | def bar(n: size, m: size): 24 | x: f32 @ DRAM 25 | for i in seq(0, n): 26 | for j in seq(0, m): # <-- NODE 27 | x = 0.0 28 | x = 1.0 29 | x = 2.0 30 | x = 3.0 31 | x = 4.0 32 | x = 5.0 33 | 34 | def bar(n: size, m: size): 35 | x: f32 @ DRAM 36 | for i in seq(0, n): 37 | for j in seq(0, m): 38 | x = 0.0 # <-- NODE 39 | x = 1.0 40 | x = 2.0 41 | x = 3.0 42 | x = 4.0 43 | x = 5.0 44 | 45 | def bar(n: size, m: size): 46 | x: f32 @ DRAM 47 | for i in seq(0, n): 48 | for j in seq(0, m): 49 | x = 0.0 50 | x = 1.0 51 | x = 2.0 # <-- NODE 52 | x = 3.0 53 | x = 4.0 54 | x = 5.0 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_cursor_replace_expr.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | for i in seq(0, n): 3 | for j in seq(0, 42): 4 | x: f32 @ DRAM 5 | x = 0.0 6 | y: f32 @ DRAM 7 | y = 1.1 
-------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_cursor_replace_expr_deep.txt: -------------------------------------------------------------------------------- 1 | def example(): 2 | x: f32 @ DRAM 3 | x = 1.0 * (4.0 + 3.0) -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_double_insert_forwarding.txt: -------------------------------------------------------------------------------- 1 | def proc_s1(): 2 | x: f32 @ DRAM 3 | if 1 < 2: 4 | x = 1.0 5 | x = 2.0 6 | else: 7 | x = 3.0 8 | x = 4.0 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_gap_insert_pass.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | x: f32 @ DRAM 5 | x = 0.0 6 | pass 7 | y: f32 @ DRAM 8 | y = 1.1 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_insert_root_end.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | x: f32 @ DRAM 5 | x = 0.0 6 | y: f32 @ DRAM 7 | y = 1.1 8 | pass -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_insert_root_front.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | pass 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | x: f32 @ DRAM 6 | x = 0.0 7 | y: f32 @ DRAM 8 | y = 1.1 -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_move_forward_diff_scopes_1.txt: -------------------------------------------------------------------------------- 1 | def 
foo(): 2 | z: i8 @ DRAM 3 | for i in seq(0, 4): 4 | x: i8 @ DRAM 5 | y: i8 @ DRAM 6 | pass -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_move_forward_if_orelse.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | if True: 3 | pass 4 | else: 5 | y: i8 @ DRAM 6 | x: i8 @ DRAM -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_move_forwarding_for_blocks_gap_after.txt: -------------------------------------------------------------------------------- 1 | def baz(n: size, m: size): 2 | for i in seq(0, n): 3 | for j in seq(0, m): # <-- NODE 4 | x: f32 @ DRAM 5 | x = 0.0 6 | for k in seq(0, n): 7 | pass 8 | y: f32 @ DRAM 9 | y = 1.1 10 | pass 11 | 12 | def baz(n: size, m: size): 13 | for i in seq(0, n): 14 | for j in seq(0, m): 15 | # BLOCK START 16 | x: f32 @ DRAM 17 | x = 0.0 18 | for k in seq(0, n): 19 | pass 20 | y: f32 @ DRAM 21 | y = 1.1 22 | pass 23 | # BLOCK END 24 | 25 | def baz(n: size, m: size): 26 | for i in seq(0, n): 27 | for j in seq(0, m): 28 | x: f32 @ DRAM 29 | x = 0.0 30 | for k in seq(0, n): 31 | # BLOCK START 32 | pass 33 | y: f32 @ DRAM 34 | y = 1.1 35 | pass 36 | # BLOCK END 37 | 38 | def baz(n: size, m: size): 39 | for i in seq(0, n): 40 | for j in seq(0, m): 41 | x: f32 @ DRAM 42 | x = 0.0 43 | for k in seq(0, n): 44 | pass 45 | # BLOCK START 46 | y: f32 @ DRAM 47 | y = 1.1 48 | # BLOCK END 49 | pass -------------------------------------------------------------------------------- /tests/golden/test_internal_cursors/test_node_replace.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): 2 | x: f32 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | x = 0.0 6 | x = 1.0 7 | x = 2.0 8 | pass 9 | x = 4.0 10 | x = 5.0 
-------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_capture_nested_quote.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | a = a 4 | a = a 5 | a = a 6 | C: 7 | #include "test.h" 8 | 9 | #include 10 | #include 11 | 12 | // foo( 13 | // a : i32 @DRAM 14 | // ) 15 | void foo( void *ctxt, int32_t* a ) { 16 | *a = *a; 17 | *a = *a; 18 | *a = *a; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_captured_closure.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def bar(a: i32 @ DRAM): 3 | a += 1 4 | a += 2 5 | a += 3 6 | a += 4 7 | a += 5 8 | a += 6 9 | a += 7 10 | a += 8 11 | a += 9 12 | a += 10 13 | C: 14 | #include "test.h" 15 | 16 | #include 17 | #include 18 | 19 | // bar( 20 | // a : i32 @DRAM 21 | // ) 22 | void bar( void *ctxt, int32_t* a ) { 23 | *a += ((int32_t) 1); 24 | *a += ((int32_t) 2); 25 | *a += ((int32_t) 3); 26 | *a += ((int32_t) 4); 27 | *a += ((int32_t) 5); 28 | *a += ((int32_t) 6); 29 | *a += ((int32_t) 7); 30 | *a += ((int32_t) 8); 31 | *a += ((int32_t) 9); 32 | *a += ((int32_t) 10); 33 | } 34 | 35 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_conditional.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def bar1(a: i8 @ DRAM): 3 | b: i8 @ DRAM 4 | b += 1 5 | def bar2(a: i8 @ DRAM): 6 | b: i8 @ DRAM 7 | b = 0 8 | C: 9 | #include "test.h" 10 | 11 | #include 12 | #include 13 | 14 | // bar1( 15 | // a : i8 @DRAM 16 | // ) 17 | void bar1( void *ctxt, const int8_t* a ) { 18 | int8_t b; 19 | b += ((int8_t) 1); 20 | } 21 | 22 | // bar2( 23 | // a : i8 @DRAM 24 | // ) 25 | void bar2( void *ctxt, const int8_t* a ) { 26 | int8_t b; 27 | b = 
((int8_t) 0); 28 | } 29 | 30 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_constant_lifting.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: f64 @ DRAM): 3 | a = 2.0818897486445276 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : f64 @DRAM 12 | // ) 13 | void foo( void *ctxt, double* a ) { 14 | *a = 2.0818897486445276; 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_eval_expr_in_mem.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: f32 @ DRAM): 3 | pass 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : f32 @DRAM 12 | // ) 13 | void foo( void *ctxt, const float* a ) { 14 | ; // NO-OP 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_implicit_lhs_unquote.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM, b: i32 @ DRAM): 3 | a += 1 4 | b += 1 5 | C: 6 | #include "test.h" 7 | 8 | #include 9 | #include 10 | 11 | // foo( 12 | // a : i32 @DRAM, 13 | // b : i32 @DRAM 14 | // ) 15 | void foo( void *ctxt, int32_t* a, int32_t* b ) { 16 | *a += ((int32_t) 1); 17 | *b += ((int32_t) 1); 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_local_externs.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: f64 @ DRAM): 3 | a = sin(a) 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | #include 11 | // foo( 12 | // a : f64 @DRAM 13 | // ) 14 | void foo( void *ctxt, double* a ) { 15 | *a = 
sin((double)*a); 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_proc_shadowing.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: f32 @ DRAM): 3 | sin(a) 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // sin( 11 | // a : f32 @DRAM 12 | // ) 13 | static void sin( void *ctxt, float* a ); 14 | 15 | // foo( 16 | // a : f32 @DRAM 17 | // ) 18 | void foo( void *ctxt, float* a ) { 19 | sin(ctxt,a); 20 | } 21 | 22 | // sin( 23 | // a : f32 @DRAM 24 | // ) 25 | static void sin( void *ctxt, float* a ) { 26 | *a = 0.0f; 27 | } 28 | 29 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_quote_complex_expr.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | a = a + 1 + 1 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i32 @DRAM 12 | // ) 13 | void foo( void *ctxt, int32_t* a ) { 14 | *a = *a + ((int32_t) 1) + ((int32_t) 1); 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_quote_elision.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM, b: i32 @ DRAM): 3 | b = a 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i32 @DRAM, 12 | // b : i32 @DRAM 13 | // ) 14 | void foo( void *ctxt, const int32_t* a, int32_t* b ) { 15 | *b = *a; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_scope_collision1.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | b: i32 @ DRAM 4 | b 
= 2 5 | a = b 6 | C: 7 | #include "test.h" 8 | 9 | #include 10 | #include 11 | 12 | // foo( 13 | // a : i32 @DRAM 14 | // ) 15 | void foo( void *ctxt, int32_t* a ) { 16 | int32_t b; 17 | b = ((int32_t) 2); 18 | *a = b; 19 | } 20 | 21 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_scope_collision2.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM, b: i32 @ DRAM): 3 | b = 1 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i32 @DRAM, 12 | // b : i32 @DRAM 13 | // ) 14 | void foo( void *ctxt, const int32_t* a, int32_t* b ) { 15 | *b = ((int32_t) 1); 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_scope_nesting.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i8 @ DRAM, b: i8 @ DRAM): 3 | a = b 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i8 @DRAM, 12 | // b : i8 @DRAM 13 | // ) 14 | void foo( void *ctxt, int8_t* a, const int8_t* b ) { 15 | *a = *b; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_scoping.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i8 @ DRAM): 3 | a = a 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i8 @DRAM 12 | // ) 13 | void foo( void *ctxt, int8_t* a ) { 14 | *a = *a; 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_statement_assignment.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | a += 1 4 | a += 2 5 | a 
+= 1 6 | a += 2 7 | C: 8 | #include "test.h" 9 | 10 | #include 11 | #include 12 | 13 | // foo( 14 | // a : i32 @DRAM 15 | // ) 16 | void foo( void *ctxt, int32_t* a ) { 17 | *a += ((int32_t) 1); 18 | *a += ((int32_t) 2); 19 | *a += ((int32_t) 1); 20 | *a += ((int32_t) 2); 21 | } 22 | 23 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_statements.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | 4 | 5 | #include 6 | #include 7 | 8 | 9 | 10 | // foo( 11 | // a : i32 @DRAM 12 | // ) 13 | void foo( void *ctxt, int32_t* a ) { 14 | *a += ((int32_t) 1); 15 | *a += ((int32_t) 1); 16 | for (int_fast32_t i = 0; i < 2; i++) { 17 | *a += ((int32_t) 1); 18 | *a += ((int32_t) 1); 19 | } 20 | } 21 | 22 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_type_params.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def bar1(a: i32 @ DRAM, b: i8 @ DRAM): 3 | c: i32[4] @ DRAM 4 | for i in seq(0, 3): 5 | d: i32 @ DRAM 6 | d = b 7 | c[i + 1] = a + c[i] * d 8 | a = c[3] 9 | def bar2(a: f64 @ DRAM, b: f64 @ DRAM): 10 | c: f64[4] @ DRAM 11 | for i in seq(0, 3): 12 | d: f64 @ DRAM 13 | d = b 14 | c[i + 1] = a + c[i] * d 15 | a = c[3] 16 | C: 17 | #include "test.h" 18 | 19 | #include 20 | #include 21 | 22 | // bar1( 23 | // a : i32 @DRAM, 24 | // b : i8 @DRAM 25 | // ) 26 | void bar1( void *ctxt, int32_t* a, const int8_t* b ) { 27 | int32_t *c = (int32_t*) malloc(4 * sizeof(*c)); 28 | for (int_fast32_t i = 0; i < 3; i++) { 29 | int32_t d; 30 | d = (int32_t)(*b); 31 | c[i + 1] = *a + c[i] * d; 32 | } 33 | *a = c[3]; 34 | free(c); 35 | } 36 | 37 | // bar2( 38 | // a : f64 @DRAM, 39 | // b : f64 @DRAM 40 | // ) 41 | void bar2( void *ctxt, double* a, const double* b ) { 42 | double *c = (double*) malloc(4 * sizeof(*c)); 43 | for 
(int_fast32_t i = 0; i < 3; i++) { 44 | double d; 45 | d = *b; 46 | c[i + 1] = *a + c[i] * d; 47 | } 48 | *a = c[3]; 49 | free(c); 50 | } 51 | 52 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_type_quote_elision.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i8 @ DRAM, x: i8[2] @ DRAM): 3 | a += x[0] 4 | a += x[1] 5 | C: 6 | #include "test.h" 7 | 8 | #include 9 | #include 10 | 11 | // foo( 12 | // a : i8 @DRAM, 13 | // x : i8[2] @DRAM 14 | // ) 15 | void foo( void *ctxt, int8_t* a, const int8_t* x ) { 16 | *a += x[0]; 17 | *a += x[1]; 18 | } 19 | 20 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unary_ops.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | a = -2 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i32 @DRAM 12 | // ) 13 | void foo( void *ctxt, int32_t* a ) { 14 | *a = ((int32_t) -2); 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unquote_elision.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i32 @ DRAM): 3 | a = a * 2 4 | C: 5 | #include "test.h" 6 | 7 | #include 8 | #include 9 | 10 | // foo( 11 | // a : i32 @DRAM 12 | // ) 13 | void foo( void *ctxt, int32_t* a ) { 14 | *a = *a * ((int32_t) 2); 15 | } 16 | 17 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unquote_in_slice.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: [i8][2] @ DRAM): 3 | a[0] += a[1] 4 | def bar(a: i8[10, 10] @ DRAM): 5 | for i in seq(0, 5): 6 | foo(a[i, 2:4]) 7 
| C: 8 | #include "test.h" 9 | 10 | #include 11 | #include 12 | 13 | // bar( 14 | // a : i8[10, 10] @DRAM 15 | // ) 16 | void bar( void *ctxt, int8_t* a ) { 17 | for (int_fast32_t i = 0; i < 5; i++) { 18 | foo(ctxt,(struct exo_win_1i8){ &a[(i) * (10) + 2], { 1 } }); 19 | } 20 | } 21 | 22 | // foo( 23 | // a : [i8][2] @DRAM 24 | // ) 25 | void foo( void *ctxt, struct exo_win_1i8 a ) { 26 | a.data[0] += a.data[a.strides[0]]; 27 | } 28 | 29 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unquote_index_tuple.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: [i8][2, 2] @ DRAM): 3 | a[0, 0] += a[0, 1] 4 | a[1, 0] += a[1, 1] 5 | def bar(a: i8[10, 10, 10] @ DRAM): 6 | for i in seq(0, 7): 7 | foo(a[i, i:i + 2, i + 1:i + 3]) 8 | C: 9 | #include "test.h" 10 | 11 | #include 12 | #include 13 | 14 | // bar( 15 | // a : i8[10, 10, 10] @DRAM 16 | // ) 17 | void bar( void *ctxt, int8_t* a ) { 18 | for (int_fast32_t i = 0; i < 7; i++) { 19 | foo(ctxt,(struct exo_win_2i8){ &a[(i) * (100) + (i) * (10) + i + 1], { 10, 1 } }); 20 | } 21 | } 22 | 23 | // foo( 24 | // a : [i8][2, 2] @DRAM 25 | // ) 26 | void foo( void *ctxt, struct exo_win_2i8 a ) { 27 | a.data[0] += a.data[a.strides[1]]; 28 | a.data[a.strides[0]] += a.data[a.strides[0] + a.strides[1]]; 29 | } 30 | 31 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unquote_slice_object1.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: [i8][2] @ DRAM): 3 | a[0] += a[1] 4 | def bar(a: i8[10, 10] @ DRAM): 5 | for i in seq(0, 10): 6 | foo(a[i, 1:3]) 7 | for i in seq(0, 10): 8 | foo(a[i, 5:7]) 9 | for i in seq(0, 10): 10 | foo(a[i, 2:4]) 11 | C: 12 | #include "test.h" 13 | 14 | #include 15 | #include 16 | 17 | // bar( 18 | // a : i8[10, 10] @DRAM 19 | // ) 20 | void bar( 
void *ctxt, int8_t* a ) { 21 | for (int_fast32_t i = 0; i < 10; i++) { 22 | foo(ctxt,(struct exo_win_1i8){ &a[(i) * (10) + 1], { 1 } }); 23 | } 24 | for (int_fast32_t i = 0; i < 10; i++) { 25 | foo(ctxt,(struct exo_win_1i8){ &a[(i) * (10) + 5], { 1 } }); 26 | } 27 | for (int_fast32_t i = 0; i < 10; i++) { 28 | foo(ctxt,(struct exo_win_1i8){ &a[(i) * (10) + 2], { 1 } }); 29 | } 30 | } 31 | 32 | // foo( 33 | // a : [i8][2] @DRAM 34 | // ) 35 | void foo( void *ctxt, struct exo_win_1i8 a ) { 36 | a.data[0] += a.data[a.strides[0]]; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/golden/test_metaprogramming/test_unrolling.txt: -------------------------------------------------------------------------------- 1 | EXO IR: 2 | def foo(a: i8 @ DRAM): 3 | b: i8 @ DRAM 4 | b = 0 5 | b += a 6 | b += a 7 | b += a 8 | b += a 9 | b += a 10 | b += a 11 | b += a 12 | b += a 13 | b += a 14 | b += a 15 | C: 16 | #include "test.h" 17 | 18 | #include 19 | #include 20 | 21 | // foo( 22 | // a : i8 @DRAM 23 | // ) 24 | void foo( void *ctxt, const int8_t* a ) { 25 | int8_t b; 26 | b = ((int8_t) 0); 27 | b += *a; 28 | b += *a; 29 | b += *a; 30 | b += *a; 31 | b += *a; 32 | b += *a; 33 | b += *a; 34 | b += *a; 35 | b += *a; 36 | b += *a; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tests/golden/test_neon/test_gen_neon_simple_math_scheduling.txt: -------------------------------------------------------------------------------- 1 | def simple_math_neon_sched(n: size, x: R[n] @ DRAM, y: R[n] @ DRAM): 2 | for io in seq(0, n / 4): 3 | xVec: R[4] @ Neon 4 | neon_vld_4xf32(xVec[0:4], x[4 * io:4 + 4 * io]) 5 | yVec: R[4] @ Neon 6 | neon_vld_4xf32(yVec[0:4], y[4 * io:4 + 4 * io]) 7 | xy: R[4] @ Neon 8 | neon_vmul_4xf32(xy[0:4], xVec[0:4], yVec[0:4]) 9 | neon_vmul_4xf32(xVec[0:4], xy[0:4], yVec[0:4]) 10 | neon_vst_4xf32(x[4 * io:4 + 4 * io], xVec[0:4]) 11 | if n % 4 > 0: 12 | for ii in seq(0, n 
% 4): 13 | x[ii + n / 4 * 14 | 4] = x[ii + n / 4 * 4] * y[ii + n / 4 * 4] * y[ii + n / 4 * 4] -------------------------------------------------------------------------------- /tests/golden/test_neon/test_gen_neon_vfmla.txt: -------------------------------------------------------------------------------- 1 | def vfmla(n: size, C: R[n] @ DRAM, A: R[n] @ DRAM, B: R[n] @ DRAM): 2 | assert n == 4 3 | C_reg: R[4] @ Neon 4 | neon_vld_4xf32(C_reg[0:4], C[0:4]) 5 | A_vec: R[4] @ Neon 6 | neon_vld_4xf32(A_vec[0:4], A[0:4]) 7 | B_vec: R[4] @ Neon 8 | neon_vld_4xf32(B_vec[0:4], B[0:4]) 9 | neon_vfmla_4xf32_4xf32(C_reg[0:4], A_vec[0:4], B_vec[0:4], 0) 10 | neon_vfmla_4xf32_4xf32(C_reg[0:4], A_vec[0:4], B_vec[0:4], 1) 11 | neon_vfmla_4xf32_4xf32(C_reg[0:4], A_vec[0:4], B_vec[0:4], 2) 12 | neon_vfmla_4xf32_4xf32(C_reg[0:4], A_vec[0:4], B_vec[0:4], 3) 13 | neon_vst_4xf32(C[0:4], C_reg[0:4]) -------------------------------------------------------------------------------- /tests/golden/test_neon/test_gen_neon_vfmla_f16.txt: -------------------------------------------------------------------------------- 1 | def vfmla_f16(n: size, C: R[n] @ DRAM, A: R[n] @ DRAM, B: R[n] @ DRAM): 2 | assert n == 8 3 | C_reg: R[8] @ Neon 4 | neon_vld_8xf16(C_reg[0:8], C[0:8]) 5 | A_vec: R[8] @ Neon 6 | neon_vld_8xf16(A_vec[0:8], A[0:8]) 7 | B_vec: R[8] @ Neon 8 | neon_vld_8xf16(B_vec[0:8], B[0:8]) 9 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 0) 10 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 1) 11 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 2) 12 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 3) 13 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 4) 14 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 5) 15 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 6) 16 | neon_vfmla_8xf16_8xf16(C_reg[0:8], A_vec[0:8], B_vec[0:8], 7) 17 | neon_vst_8xf16(C[0:8], C_reg[0:8]) 
-------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_alloc_success.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N, N] @ DRAM): 2 | for j in seq(0, N): 3 | for i in seq(0, N): 4 | tmp: R @ DRAM 5 | tmp = x[i, j] * 2.0 6 | x[i, j] = tmp -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_delete_config_basic.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N] @ DRAM): 2 | for i in seq(0, N): 3 | x[i] = x[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_delete_config_bc_redundant.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N] @ DRAM): 2 | CFG.a = 3 3 | for i in seq(0, N): 4 | if i < CFG.a: 5 | x[i] = x[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_delete_config_bc_shadow.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N] @ DRAM): 2 | CFG.a = 3 3 | for i in seq(0, N): 4 | if i < CFG.a: 5 | x[i] = x[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_delete_config_subproc_basic.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N] @ DRAM): 2 | for i in seq(0, N): 3 | x[i] = x[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_reorder_loops_4pt_stencil_succeed.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N, N] @ DRAM): 2 | for j in seq(0, N): 3 | for i in seq(0, N): 4 | if 0 < i and i < 
N - 1 and (0 < j and j < N - 1): 5 | x[i, j] += -1.0 / 4.0 * (x[i - 1, j] + x[i + 1, j] + 6 | x[i, j - 1] + x[i, j + 1]) -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_reorder_loops_requiring_seq.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N, N] @ DRAM): 2 | for j in seq(0, N): 3 | for i in seq(0, N): 4 | if i > 0 and j > 0: 5 | x[i, j] += -1.0 / 3.0 * (x[i - 1, j] + x[i - 1, j - 1] + 6 | x[i, j - 1]) -------------------------------------------------------------------------------- /tests/golden/test_new_eff/test_reorder_loops_success.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, x: R[N, N] @ DRAM): 2 | for j in seq(0, N): 3 | for i in seq(0, N): 4 | x[i, j] = x[i, j] * 2.0 -------------------------------------------------------------------------------- /tests/golden/test_parallel/test_pragma_parallel_loop.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | // foo( 7 | // x : i8[10] @DRAM 8 | // ) 9 | void foo( void *ctxt, int8_t* x ) { 10 | #pragma omp parallel for 11 | for (int_fast32_t i = 0; i < 10; i++) { 12 | x[i] = ((int8_t) 1.0); 13 | } 14 | } 15 | 16 | -------------------------------------------------------------------------------- /tests/golden/test_precision/test_good_ui8_prec.txt: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | // Compiler feature macros adapted from Hedley (public domain) 6 | // https://github.com/nemequ/hedley 7 | 8 | #if defined(__has_builtin) 9 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 10 | #else 11 | # define EXO_HAS_BUILTIN(builtin) (0) 12 | #endif 13 | 14 | #if EXO_HAS_BUILTIN(__builtin_assume) 15 | # define EXO_ASSUME(expr) __builtin_assume(expr) 16 | #elif 
EXO_HAS_BUILTIN(__builtin_unreachable) 17 | # define EXO_ASSUME(expr) \ 18 | ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) 19 | #else 20 | # define EXO_ASSUME(expr) ((void)(expr)) 21 | #endif 22 | 23 | 24 | 25 | // hoge( 26 | // n : size, 27 | // x : ui8[n] @DRAM, 28 | // y : ui8 @DRAM 29 | // ) 30 | void hoge( void *ctxt, int_fast32_t n, uint8_t* x, const uint8_t* y ); 31 | 32 | 33 | #include 34 | #include 35 | 36 | // hoge( 37 | // n : size, 38 | // x : ui8[n] @DRAM, 39 | // y : ui8 @DRAM 40 | // ) 41 | void hoge( void *ctxt, int_fast32_t n, uint8_t* x, const uint8_t* y ) { 42 | for (int_fast32_t i = 0; i < n; i++) { 43 | x[i] = *y; 44 | } 45 | } 46 | 47 | -------------------------------------------------------------------------------- /tests/golden/test_reflection/test_show_effect.txt: -------------------------------------------------------------------------------- 1 | Reads: 2 | { A : (i,k) for (j,k) in Z if 3 | 0 <= j and j < M and (0 <= k and k < K) } 4 | { B : (k,j) for (j,k) in Z if 5 | 0 <= j and j < M and (0 <= k and k < K) } 6 | Reduces: 7 | { C : (i,j) for (j,k) in Z if 8 | 0 <= j and j < M and (0 <= k and k < K) } 9 | -------------------------------------------------------------------------------- /tests/golden/test_rvv/test_gen_rvv.txt: -------------------------------------------------------------------------------- 1 | def rvv_test(M: size, C: f32[M] @ DRAM, A: f32[M] @ DRAM, B: f32[M] @ DRAM): 2 | assert M == 8 3 | C_reg: f32[2, 4] @ RVV 4 | for io in seq(0, 2): 5 | rvv_vld_4xf32(C_reg[io + 0, 0:4], C[4 * io + 0:4 * io + 4], 4) 6 | A_vec: R[2, 4] @ RVV 7 | for io in seq(0, 2): 8 | rvv_vld_4xf32(A_vec[io + 0, 0:4], A[4 * io + 0:4 * io + 4], 4) 9 | B_vec: R[2, 4] @ RVV 10 | for io in seq(0, 2): 11 | rvv_vld_4xf32(B_vec[io + 0, 0:4], B[4 * io + 0:4 * io + 4], 4) 12 | for io in seq(0, 2): 13 | rvv_vfmacc_4xf32_4xf32(C_reg[io + 0, 0:4], A_vec[io + 0, 0:4], 14 | B_vec[io + 0, 0:4], 4) 15 | for io in seq(0, 2): 16 | rvv_vst_4xf32(C[4 * io + 0:4 * io 
+ 4], C_reg[io + 0, 0:4], 4) 17 | -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_add_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R @ DRAM): 2 | x = 1.0 3 | for i in seq(0, 5): 4 | x = 2.0 5 | x = 3.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_add_loop1.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R @ DRAM 3 | for i in seq(0, 10): 4 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_add_loop2.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R @ DRAM 3 | for i in seq(0, 10): 4 | if i == 0: 5 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_add_loop3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | x: R @ DRAM 3 | for i in seq(0, n + m): 4 | if i == 0: 5 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_bind_cursor_arg.txt: -------------------------------------------------------------------------------- 1 | def foo(a: R @ DRAM): 2 | const: R @ DRAM 3 | const = 1.0 4 | a = const -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_bind_expr_cse.txt: -------------------------------------------------------------------------------- 1 | def foo(a: i8 @ DRAM, b: i8 @ DRAM, c: i8 @ DRAM): 2 | two_times_a: R @ DRAM 3 | two_times_a = 2.0 * a 4 | b = two_times_a 5 | for i in seq(0, 5): 6 | c += 2.0 * a 7 | a = 2.0 * a -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_bind_expr_cse_2.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8[5] @ DRAM, y: i8[5] @ DRAM): 2 | two: R @ DRAM 3 | two = 2.0 4 | for i in seq(0, 5): 5 | x[i] = two 6 | for i in seq(0, 5): 7 | y[i] = two -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_bind_expr_diff_indices.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, x: i8[n] @ DRAM, y: i8[n] @ DRAM, z: i8[n] @ DRAM): 2 | for i in seq(0, n - 1): 3 | w: i8[n] @ DRAM 4 | x[i] = x[i] - y[i] 5 | tmp: i8 @ DRAM 6 | tmp = x[i] + y[i] + 1.0 7 | w[i + 1] = tmp 8 | x[i] = y[i] 9 | w[i] = x[i] + y[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_bind_lhs.txt: -------------------------------------------------------------------------------- 1 | def myfunc_cpu(inp: i32[1, 1, 16] @ DRAM, out: i32[1, 1, 16] @ DRAM): 2 | for ii in seq(0, 1): 3 | for jj in seq(0, 1): 4 | for kk in seq(0, 16): 5 | inp_ram: i32 @ DRAM 6 | inp_ram = inp[ii, jj, kk] 7 | out_ram: i32 @ DRAM 8 | out_ram = out[ii, jj, kk] 9 | out[ii, jj, kk] += out_ram + inp_ram 10 | out[ii, jj, kk] = out[ii, jj, kk] * inp_ram -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_bool_partial_eval.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp: i8[n] @ DRAM 3 | for i in seq(0, n): 4 | if False == True: 5 | tmp[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_commute.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[3] @ DRAM, y: R[3] @ DRAM, z: R @ DRAM): 2 | z = y[2] * x[0] 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_commute3.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[3] @ DRAM, y: R[3] @ DRAM, z: R @ DRAM): 2 | z = (x[1] + y[1] + y[2]) * (x[0] + y[0]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | assert n > 3 3 | for i in seq(0, 3): 4 | x: R @ DRAM 5 | x = 0.0 6 | for i in seq(3, n): 7 | x: R @ DRAM 8 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_at_hi.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | for i in seq(3, 5): 3 | x: f32 @ DRAM 4 | for i in seq(5, 5): 5 | x: f32 @ DRAM -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_at_lo.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | for i in seq(3, 3): 3 | x: f32 @ DRAM 4 | for i in seq(3, 5): 5 | x: f32 @ DRAM -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_by_expr.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: f32[n] @ DRAM): 2 | assert n >= 1 3 | for i in seq(0, n / 2): 4 | x[i] = 0.0 5 | for i in seq(n / 2, n): 6 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_by_expr1.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: f32[n] @ DRAM): 2 | assert n >= 1 3 | for i in seq(0, n - 1): 4 | x[i] = 0.0 5 | for i in seq(n - 
1, n): 6 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_by_expr2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | assert n > m 3 | x: R[n] @ DRAM 4 | for i in seq(m, 1 + m): 5 | x[i] = 0.0 6 | for i in seq(1 + m, n): 7 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_nonzero_lo.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | assert n >= 5 3 | x: R[n] @ DRAM 4 | for i in seq(3, 5): 5 | x[i] = 0.0 6 | for i in seq(5, n): 7 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_nonzero_lo2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | assert m >= 5 3 | assert m <= 8 4 | assert n >= 9 5 | assert n > 1 + m 6 | x: R[n] @ DRAM 7 | for i in seq(m, 9): 8 | x[i] = 0.0 9 | for i in seq(9, n): 10 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_cut_loop_syrk.txt: -------------------------------------------------------------------------------- 1 | def SYRK(M: size, K: size, A: [f32][M, K] @ DRAM, A_t: [f32][M, K] @ DRAM, 2 | C: [f32][M, M] @ DRAM): 3 | assert M >= 1 4 | assert K >= 1 5 | assert stride(A, 1) == 1 6 | assert stride(A_t, 1) == 1 7 | assert stride(C, 1) == 1 8 | for io in seq(0, M / 4): 9 | for ii in seq(0, 4): 10 | for j in seq(0, 1): 11 | for k in seq(0, K): 12 | C[ii + 4 * io, j] += A[ii + 4 * io, k] * A_t[j, k] 13 | for j in seq(0, ii + 4 * io): 14 | for k in seq(0, K): 15 | C[ii + 4 * io, 1 + j] += A[ii + 4 * io, k] * A_t[1 + j, k] -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_cut_then_shift_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[20] @ DRAM): 2 | assert n >= m 3 | for i in seq(5, 13): 4 | x[-3 + i] = 0.0 5 | for i in seq(0, 10): 6 | x[10 + i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_delete_buffer.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_delete_pass.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R @ DRAM): 2 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_delete_pass_1.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R @ DRAM): 2 | for i in seq(0, 16): 3 | x = 1.0 4 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_dim_1.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, A: R[n + m + 12] @ DRAM): 2 | x: R[n, 3, 4, m] @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, 12): 5 | for k in seq(0, m): 6 | x[i, j / 4, j % 4, k] = A[i + j + k] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_dim_2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, A: R[n + m + 12] @ DRAM): 2 | x: R[n, 3 * m, 4, m] @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, 12): 5 | for k in seq(0, m): 6 | x[i, j / 4, j % 4, k] = A[i + j + k] -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_divide_dim_3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | x: R[n, 1, (7 + m) / 8 * 8 / 8, 8, 1, m, 1] @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | for k in seq(0, m): 6 | x[i, 0, j / 8, j % 8, 0, k, 0] = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_by_1_cut.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size): 2 | for io in seq(0, n / 1): 3 | for ii in seq(0, 1): 4 | pass 5 | for ii in seq(0, n % 1): 6 | pass 7 | def bar(n: size): 8 | for io in seq(0, n): 9 | for ii in seq(0, 1): 10 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_by_1_guard.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size): 2 | for io in seq(0, (n + 0) / 1): 3 | for ii in seq(0, 1): 4 | if 1 * io + ii < n: 5 | pass 6 | def bar(n: size): 7 | for io in seq(0, n): 8 | for ii in seq(0, 1): 9 | if ii + io < n: 10 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_cut_and_guard.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp: i8[n] @ DRAM 3 | for io in seq(0, n / 4): 4 | for ii in seq(0, 4): 5 | tmp[4 * io + ii] = A[4 * io + ii] 6 | foo(tmp[4 * io + ii:4 * io + ii + 1]) 7 | if n % 4 > 0: 8 | for ii in seq(0, n % 4): 9 | tmp[ii + n / 4 * 4] = A[ii + n / 4 * 4] 10 | foo(tmp[ii + n / 4 * 4:ii + n / 4 * 4 + 1]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_perfect.txt: -------------------------------------------------------------------------------- 1 | def foo(n: 
size, A: i8[n] @ DRAM): 2 | assert n % 4 == 0 3 | for io in seq(0, n / 4): 4 | for ii in seq(0, 4): 5 | A[4 * io + ii] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_perfect2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, A: i8[n] @ DRAM): 2 | assert n % 4 == 0 3 | for io in seq(0, n / 4): 4 | tile: i8[4] @ DRAM 5 | for ii in seq(0, 4): 6 | tile[ii] = 0.2 7 | for i0 in seq(0, 4): 8 | A[i0 + 4 * io] = tile[i0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_loop_perfect3.txt: -------------------------------------------------------------------------------- 1 | def foo(m: size, n: size, A: R[m, n] @ DRAM): 2 | assert n % 4 == 0 and m % 8 == 0 3 | for io in seq(0, m / 8): 4 | for ii in seq(0, 8): 5 | for jo in seq(0, n / 4): 6 | for ji in seq(0, 4): 7 | A[ii + 8 * io, ji + 4 * jo] = 0.2 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_divide_with_recompute.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, A: i8[n + 3] @ DRAM): 2 | assert n % 4 == 0 3 | for io in seq(0, n / 4): 4 | for ii in seq(0, 7): 5 | A[ii + 4 * io] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_double_fission.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, a: f32[N] @ DRAM, b: f32[N] @ DRAM, out: f32[N] @ DRAM): 2 | res: f32[N + 1] @ DRAM 3 | for i in seq(0, N): 4 | res[i] = 0.0 5 | for i in seq(0, N): 6 | res[i] += a[i] * b[i] 7 | for i in seq(0, N): 8 | out[i] = res[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code.txt: 
-------------------------------------------------------------------------------- 1 | def foo(): 2 | x: f32 @ DRAM 3 | for i in seq(0, 8): 4 | x = 0.0 5 | a: R @ DRAM 6 | a = x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code2.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: f32 @ DRAM 3 | for i in seq(0, 8): 4 | b: R @ DRAM 5 | b = x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code3.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: f32 @ DRAM 3 | for i in seq(0, 8): 4 | x = 0.0 5 | a: R @ DRAM 6 | a = x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code4.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: f32 @ DRAM 3 | for i in seq(0, 8): 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code7.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | if 0 < n: 3 | for i in seq(0, n): 4 | x: f32 @ DRAM 5 | else: 6 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code8.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_eliminate_dead_code9.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | pass -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_expand_dim.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: i8 @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | a: i8[n] @ DRAM 5 | x = a[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_expand_dim3.txt: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | // Compiler feature macros adapted from Hedley (public domain) 6 | // https://github.com/nemequ/hedley 7 | 8 | #if defined(__has_builtin) 9 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 10 | #else 11 | # define EXO_HAS_BUILTIN(builtin) (0) 12 | #endif 13 | 14 | #if EXO_HAS_BUILTIN(__builtin_assume) 15 | # define EXO_ASSUME(expr) __builtin_assume(expr) 16 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 17 | # define EXO_ASSUME(expr) \ 18 | ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) 19 | #else 20 | # define EXO_ASSUME(expr) ((void)(expr)) 21 | #endif 22 | 23 | 24 | 25 | // foo( 26 | // n : size, 27 | // m : size, 28 | // x : i8 @DRAM 29 | // ) 30 | void foo( void *ctxt, int_fast32_t n, int_fast32_t m, int8_t* x ); 31 | 32 | 33 | #include 34 | #include 35 | 36 | // foo( 37 | // n : size, 38 | // m : size, 39 | // x : i8 @DRAM 40 | // ) 41 | void foo( void *ctxt, int_fast32_t n, int_fast32_t m, int8_t* x ) { 42 | for (int_fast32_t i = 0; i < n; i++) { 43 | for (int_fast32_t j = 0; j < m; j++) { 44 | ; // NO-OP 45 | } 46 | } 47 | for (int_fast32_t i = 0; i < n; i++) { 48 | int8_t *a = (int8_t*) malloc(n * sizeof(*a)); 49 | for (int_fast32_t j = 0; j < m; j++) { 50 | *x = a[i]; 51 | } 52 | free(a); 53 | } 54 | for (int_fast32_t i = 0; i < n; i++) { 55 | for (int_fast32_t j = 0; j < m; j++) { 56 | ; // NO-OP 57 | } 58 | } 59 | } 60 | 61 | -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_expand_dim4.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: i8 @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | pass 5 | for q in seq(0, 30): 6 | for i in seq(0, n): 7 | for j in seq(0, m): 8 | a: i8[n] @ DRAM 9 | x = a[i] 10 | for i in seq(0, n): 11 | for j in seq(0, m): 12 | pass 13 | def foo(n: size, m: size, x: i8 @ DRAM): 14 | for i in seq(0, n): 15 | for j in seq(0, m): 16 | pass 17 | for q in seq(0, 30): 18 | for i in seq(0, n): 19 | for j in seq(0, m): 20 | a: i8[40 + 1] @ DRAM 21 | x = a[10] 22 | for i in seq(0, n): 23 | for j in seq(0, m): 24 | pass 25 | def foo(n: size, m: size, x: i8 @ DRAM): 26 | for i in seq(0, n): 27 | for j in seq(0, m): 28 | pass 29 | for q in seq(0, 30): 30 | for i in seq(0, n): 31 | for j in seq(0, m): 32 | a: i8[n + m] @ DRAM 33 | x = a[i] 34 | for i in seq(0, n): 35 | for j in seq(0, m): 36 | pass 37 | def foo(n: size, m: size, x: i8 @ DRAM): 38 | for i in seq(0, n): 39 | for j in seq(0, m): 40 | pass 41 | for q in seq(0, 30): 42 | for i in seq(0, n): 43 | for j in seq(0, m): 44 | a: i8[n] @ DRAM 45 | x = a[n - 1] 46 | for i in seq(0, n): 47 | for j in seq(0, m): 48 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_expand_dim5.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: i8 @ DRAM): 2 | for i in seq(0, n): 3 | a: i8[n] @ DRAM 4 | a[i] = x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_expand_dim6.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: i8 @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | a: i8[n, m] @ DRAM 5 | a[i, j] = a[i, j] + 1.0 6 | a[i, j] += 1.0 7 | bar(m, a[i, 0:m]) 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R @ DRAM 3 | y: R[8] @ DRAM 4 | for j in seq(0, 8): 5 | x = 0.0 6 | fooooo(x, y, j) 7 | def fooooo(x: R @ DRAM, y: R[8] @ DRAM, j: index): 8 | for i in seq(0, 8): 9 | x += y[j] * 2.0 10 | def fooooo(x: R @ DRAM, y: R[8] @ DRAM, j: index): 11 | assert 0 <= j 12 | assert j < 8 13 | for i in seq(0, 8): 14 | x += y[j] * 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc2.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 2 | assert N >= 8 3 | fooooo(N, M, K, x) 4 | def fooooo(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 5 | assert N >= 8 6 | for i in seq(0, 8): 7 | x[i, 0] += 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc3.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 2 | assert N >= 8 3 | assert M >= 2 4 | if N < 10 and M < 4: 5 | foo_if(N, M, K, x) 6 | else: 7 | foo_else(N, M, K, x) 8 | def foo_if(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 9 | assert N >= 8 10 | assert M >= 2 11 | assert (N < 10 and M < 4) == True 12 | for i in seq(0, 8): 13 | x[i, 0] += 2.0 14 | def foo_else(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 15 | assert N >= 8 16 | assert M >= 2 17 | assert (N < 10 and M < 4) == False 18 | for i in seq(0, 8): 19 | x[i, 0] += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc4.txt: 
-------------------------------------------------------------------------------- 1 | def foo(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 2 | assert N >= 8 3 | fooooo(N, M, K, x) 4 | def fooooo(N: size, M: size, K: size, x: R[N, K + M] @ DRAM): 5 | assert N >= 8 6 | x[0, 0] = 0.0 7 | for i in seq(0, 8): 8 | x[i, 0] += 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc5.txt: -------------------------------------------------------------------------------- 1 | def foo(x: f32[8] @ DRAM, y: f32[8] @ DRAM): 2 | reg: f32[8] @ AVX2 3 | fooooo(x, y, reg) 4 | def fooooo(x: f32[8] @ DRAM, y: f32[8] @ DRAM, reg: f32[8] @ AVX2): 5 | for i in seq(0, 8): 6 | reg[i] = x[i] 7 | for i in seq(0, 8): 8 | y[i] = reg[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc6.txt: -------------------------------------------------------------------------------- 1 | def foo(x: [f32][8] @ DRAM, y: [f32][8] @ DRAM): 2 | assert stride(x, 0) == 1 3 | assert stride(y, 0) == 1 4 | reg: f32[8] @ AVX2 5 | fooooo(x, reg) 6 | def fooooo(x: [f32][8] @ DRAM, reg: f32[8] @ AVX2): 7 | assert stride(x, 0) == 1 8 | for i in seq(0, 8): 9 | reg[i] = x[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_extract_subproc7.txt: -------------------------------------------------------------------------------- 1 | def gemv(m: size, n: size, alpha: R @ DRAM, beta: R @ DRAM, 2 | A: [R][m, n] @ DRAM, x: [R][n] @ DRAM, y: [R][m] @ DRAM): 3 | assert stride(A, 1) == 1 4 | for i in seq(0, m): 5 | y[i] = y[i] * beta 6 | for j in seq(0, n): 7 | fooooo(m, n, alpha, A, x, y, j) 8 | def fooooo(m: size, n: size, alpha: R @ DRAM, A: [R][m, n] @ DRAM, 9 | x: [R][n] @ DRAM, y: [R][m] @ DRAM, j: index): 10 | assert stride(A, 1) == 1 11 | assert 0 <= j 12 | assert j < n 13 | for i in seq(0, m): 14 | y[i] 
+= alpha * x[j] * A[i, j] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fission.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size): 2 | x: f32 @ DRAM 3 | x = 0.0 4 | y: f32 @ DRAM 5 | y = 1.1 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fission_after_simple.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | x: f32 @ DRAM 5 | x = 0.0 6 | for i in seq(0, n): 7 | for j in seq(0, m): 8 | y: f32 @ DRAM 9 | y = 1.1 10 | def bar(n: size, m: size): 11 | for i in seq(0, n): 12 | for j in seq(0, m): 13 | x: f32 @ DRAM 14 | x = 0.0 15 | for i in seq(0, n): 16 | for j in seq(0, m): 17 | y: f32 @ DRAM 18 | y = 1.1 19 | for k in seq(0, 30): 20 | for l in seq(0, 100): 21 | x: i8 @ DRAM 22 | x = 4.0 23 | y: f32 @ DRAM 24 | y = 1.1 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_buffer_blur.txt: -------------------------------------------------------------------------------- 1 | def blur(H: size, W: size, inp: i8[H + 2, W] @ DRAM, out: i8[H, W] @ DRAM): 2 | assert H % 32 == 0 3 | assert W > 32 4 | for io in seq(0, H / 32): 5 | blur_x: i8[3, W] @ DRAM 6 | for ii in seq(0, 2): 7 | for j in seq(0, -2 + W): 8 | blur_x[ii, j] = inp[ii + 32 * io, 9 | j] + inp[ii + 32 * io, 10 | 1 + j] + inp[ii + 32 * io, 2 + j] 11 | for ii in seq(0, 32): 12 | for j in seq(0, -2 + W): 13 | blur_x[(2 + ii) % 3, 14 | j] = inp[ii + 32 * io, 15 | j] + inp[ii + 32 * io, 16 | 1 + j] + inp[ii + 32 * io, 2 + j] 17 | for j in seq(0, -2 + W): 18 | out[ii + 32 * io, j] = blur_x[ii % 3, j] + blur_x[ 19 | (1 + ii) % 3, j] + blur_x[(2 + ii) % 3, j] -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_fold_buffer_if_stmt.txt: -------------------------------------------------------------------------------- 1 | def foo(condition: bool @ DRAM): 2 | x: i8[3] @ DRAM 3 | x[2] = 0.0 4 | if condition: 5 | x[1] = 0.0 6 | x[2] = 0.0 7 | else: 8 | for i in seq(2, 5): 9 | x[i % 3] = 1.0 10 | x[(-1 + i) % 3] = 2.0 11 | x[-2 + i] = 2.0 12 | x[0] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_buffer_loop_in_context.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size): 2 | assert N > 2 3 | x: i8[3] @ DRAM 4 | x[2] = 0.0 5 | for i in seq(0, N / 2): 6 | x[2 * i % 3] = 1.0 7 | x[(1 + 2 * i) % 3] = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_buffer_loop_simple.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size): 2 | assert N > 4 3 | x: i8[3] @ DRAM 4 | for i in seq(0, -4 + N): 5 | for j in seq(i, 4 + i): 6 | x[j % 3] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_buffer_sequential_stmts.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: i8[3] @ DRAM 3 | x[0] = 0.0 4 | x[2] = 0.0 5 | x[0] = 0.0 6 | x[2] = 0.0 7 | x[1] = 0.0 8 | x[2] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_buffer_within_stmt.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: i8[4] @ DRAM 3 | x[1] = x[3] + x[0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_into_reduce_1.txt: -------------------------------------------------------------------------------- 1 | def bar(result: 
f32 @ DRAM): 2 | result += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fold_into_reduce_2.txt: -------------------------------------------------------------------------------- 1 | def bar(m: size, n: size, a: f32[m, n] @ DRAM, x: f32 @ DRAM): 2 | for i in seq(0, m): 3 | for j in seq(0, n): 4 | a[i, j] += x * x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_formatted_expr_1.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, arr: R[n] @ DRAM): 2 | for i in seq(0, n): 3 | tmp: R[n + 1] @ DRAM 4 | tmp[i] = 1.0 5 | arr[i] = tmp[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_formatted_expr_2.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size, arr: R[n, m] @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, m): 4 | tmp: R[(n + 1) * (1 + m)] @ DRAM 5 | tmp[i * m + j] = 1.0 6 | arr[i, j] = tmp[i * m + j] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_formatted_expr_3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: f32[n] @ DRAM): 2 | assert n >= 10 3 | for i in seq(0, -3 + n): 4 | x[i] = 0.0 5 | for i in seq(-3 + n, -2 + n): 6 | x[i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fuse_if.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R @ DRAM, a: index, b: index): 2 | if a == b: 3 | x += 1.0 4 | x += 2.0 5 | else: 6 | x += 3.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fuse_loop.txt: 
-------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | y: R[n] @ DRAM 3 | for i in seq(0, n): 4 | y[i] = x[i] 5 | x[i] = y[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fuse_loop2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | assert n > 3 3 | y: R[n] @ DRAM 4 | for i in seq(3, n): 5 | y[i] = x[i] 6 | x[i] = y[i] + 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_fuse_loop_commute_config.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | y: R[n] @ DRAM 3 | for i in seq(0, n): 4 | CFG.j = 0 5 | CFG.j = 0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_inline_assign.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, y: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | x: i8[5] @ DRAM 4 | y[i] = 1.0 + x[2] 5 | a: i8 @ DRAM 6 | a = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_inline_assign_scalar.txt: -------------------------------------------------------------------------------- 1 | def foo(b: f32 @ DRAM): 2 | a: f32 @ DRAM 3 | b = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_inline_window.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: R[n, m] @ DRAM): 2 | assert n > 4 3 | assert m > 4 4 | for i in seq(0, n - 4): 5 | for j in seq(0, m - 4): 6 | a: R @ DRAM 7 | a = x[i, j] * x[i + 2, j + 1] 8 | x[i + 2, j + 1] = a + x[i + 1, j + 1] 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_inline_window2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, k: size, x: R[n, m, k, 10] @ DRAM): 2 | x[0, 0, 0, 0] = 0.0 3 | bar(stride(x, 2)) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_inline_window3.txt: -------------------------------------------------------------------------------- 1 | def memset(n: size, x: [R][n] @ DRAM): 2 | assert n % 16 == 0 3 | res: R @ DRAM 4 | for io in seq(0, n / 16): 5 | inner_memset(x[16 * io:16 + 16 * io]) 6 | res += x[16 * io] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_insert_noop_call.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: i8[n] @ DRAM, locality_hint: size): 2 | assert locality_hint >= 0 3 | assert locality_hint < 8 4 | prefetch(x[1:2], locality_hint) 5 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_join_loops_body_match.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: i8[n + 1] @ DRAM): 2 | for i in seq(0, n + 1): 3 | x[i] = 0.0 4 | x[i] += -(1.0 + x[i]) 5 | for j in seq(0, 1): 6 | if i == j: 7 | pass 8 | a: i8[4, 2] @ DRAM 9 | y = a[1:3, 1:2] 10 | do_nothing(y) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_join_loops_equiv_but_diff_bounds.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: i8[4] @ DRAM): 2 | assert n % 4 == 2 3 | for i in seq(0, 4): 4 | x[i] = 0.0 -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_left_reassociate_expr_1.txt: -------------------------------------------------------------------------------- 1 | def foo(a: f32 @ DRAM, b: f32 @ DRAM, c: f32 @ DRAM): 2 | b = c + (a + b) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_left_reassociate_expr_2.txt: -------------------------------------------------------------------------------- 1 | def foo(a: f32 @ DRAM, b: f32 @ DRAM, c: f32 @ DRAM): 2 | b = c * (a * b * b) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift.txt: -------------------------------------------------------------------------------- 1 | def bar(A: i8[16, 10] @ DRAM): 2 | a: i8[16, 20] @ DRAM 3 | for i in seq(0, 10): 4 | for k in seq(0, 16): 5 | a[k, i] = A[k, i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_alloc_simple.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp_a: i8 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, n): 5 | tmp_a = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_alloc_simple2.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp_a: i8 @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, n): 5 | tmp_a = A[i] 6 | for i in seq(0, n): 7 | for j in seq(0, n): 8 | tmp_a_1: i8 @ DRAM 9 | tmp_a_1 = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_alloc_simple3.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp_a: i8 @ DRAM 3 | for k in seq(0, n): 4 | for i in seq(0, n): 5 | for j in seq(0, n): 6 
| tmp_a = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_alloc_simple_empty_body.txt: -------------------------------------------------------------------------------- 1 | def bar(): 2 | tmp: i8 @ DRAM 3 | for i in seq(0, 4): 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_halfway.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: index): 2 | for j in seq(0, n): 3 | if i < 10: 4 | if n > 20: 5 | x[j] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_in_else_branch_of_parent.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | if n < 20: 3 | if 10 < n: 4 | x[0] = 1.0 5 | else: 6 | x[0] = 2.0 7 | else: 8 | if 10 < n: 9 | x[0] = 1.0 10 | else: 11 | x[0] = 3.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_in_full_nest.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | if n < 20: 3 | if 10 < n: 4 | if n < 15: 5 | x[0] = 1.0 6 | else: 7 | x[0] = 2.0 8 | else: 9 | x[0] = 3.0 10 | else: 11 | if 10 < n: 12 | if n < 15: 13 | x[0] = 1.0 14 | else: 15 | x[0] = 2.0 16 | else: 17 | x[0] = 4.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_middle.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: index): 2 | if n > 20: 3 | for j in seq(0, n): 4 | if i < 10: 5 | x[j] = 1.0 -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_lift_if_past_for.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: index): 2 | if i < 10: 3 | for j in seq(0, n): 4 | x[j] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_past_if.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: index): 2 | assert i > 0 3 | if i < 10: 4 | if i < n: 5 | x[i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_past_if_then_for.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: index): 2 | if i < 10: 3 | for j in seq(0, n): 4 | if n > 20: 5 | x[j] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_with_else_past_if.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: size): 2 | assert n > 10 3 | if n > 20: 4 | if i < 10: 5 | x[i] = 1.0 6 | else: 7 | if i < 10: 8 | x[i] = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_with_else_past_if_with_else.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM, i: size): 2 | assert n > 10 3 | assert i < n 4 | if n > 20: 5 | if i < 10: 6 | x[i] = 1.0 7 | else: 8 | x[i] = 3.0 9 | else: 10 | if i < 10: 11 | x[i] = 2.0 12 | else: 13 | x[i] = 3.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_with_pass_body.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | if n < 
20: 3 | if 10 < n: 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_if_with_pass_body_and_else.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | if n < 20: 3 | if 10 < n: 4 | pass 5 | else: 6 | if 10 < n: 7 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_reduce_constant_1.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R @ DRAM 3 | x = 0.0 4 | for i in seq(0, 8): 5 | x += 2.0 6 | x = 3.0 * x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_reduce_constant_2.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R[2] @ DRAM 3 | y: R[8] @ DRAM 4 | x[0] = 0.0 5 | for i in seq(0, 8): 6 | for j in seq(0, 8): 7 | y[i] += y[i] * 2.0 8 | for k in seq(0, 8): 9 | x[0] += y[i] 10 | x[0] += y[i] 11 | x[0] = 3.0 * x[0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_reduce_constant_3.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: R @ DRAM 3 | y: R[8] @ DRAM 4 | for j in seq(0, 8): 5 | x = 0.0 6 | for i in seq(0, 8): 7 | x += 2.0 8 | x = y[j] * x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_lift_scope.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n, n] @ DRAM): 2 | for j in seq(0, n): 3 | for i in seq(0, n): 4 | if j < 10: 5 | x[i, j] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_mem_aware_replace.txt: 
-------------------------------------------------------------------------------- 1 | def bar(src: f32[8] @ DRAM): 2 | dst: f32[8] @ AVX2 3 | mm256_loadu_ps(dst[0:8], src[0:8]) 4 | mm256_storeu_ps(src[0:8], dst[0:8]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_merge_writes_all_4_cases.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[4] @ DRAM, y: R[4] @ DRAM): 2 | for i in seq(0, 10): 3 | if i < 5: 4 | tmp: R[4] @ DRAM 5 | tmp[0] = y[0] 6 | tmp[1] = x[1] + y[1] 7 | tmp[2] = y[2] 8 | tmp[3] += x[3] + y[3] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_merge_writes_array_indexing.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[3] @ DRAM, y: R[3] @ DRAM, z: R @ DRAM): 2 | for i in seq(0, 3): 3 | for j in seq(0, 3): 4 | if i < 2: 5 | tmp: R[4, 4] @ DRAM 6 | tmp[i + j, j] = x[i] + y[j] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_merge_writes_consecutively.txt: -------------------------------------------------------------------------------- 1 | def bar(w: R @ DRAM, x: R @ DRAM, y: R @ DRAM, z: R @ DRAM): 2 | z = w + x + y 3 | w = x -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_merge_writes_type_check.txt: -------------------------------------------------------------------------------- 1 | def bar(y: f32 @ DRAM): 2 | x: f32 @ DRAM 3 | x = 0.0 + y -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_mult_dim_1.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, A: R[n + m + 12] @ DRAM): 2 | x: R[4 * n, m] @ DRAM 3 | for i in seq(0, n): 4 | for j in seq(0, m): 5 | for k 
in seq(0, 4): 6 | x[4 * i + k, j] = A[i + j + k] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_new_expr_multi_vars.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, arr: R[n] @ DRAM): 2 | for i in seq(0, n): 3 | tmp: R[n] @ DRAM 4 | tmp[i] = 1.0 5 | arr[i] = tmp[i] 6 | i: R @ DRAM 7 | i = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_old_lift_alloc_config.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | assert n > 4 3 | CFG.cfg = A[0] 4 | win_stmt = A[0:4] 5 | tmp_a: i8[n + 1] @ DRAM 6 | for i in seq(0, n): 7 | tmp_a[i] = A[i] 8 | A[0] = CFG.cfg -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_parallelize_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(A: i8[10] @ DRAM): 2 | for i in par(0, 10): 3 | A[i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_pattern_matching_id_in_scheduling_ops.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, ret: i8 @ DRAM): 2 | reg: i8[n] @ DRAM 3 | for i in seq(0, n): 4 | reg_1: R[n] @ DRAM 5 | reg_1[i] = 1.0 6 | ret += reg[i] + reg_1[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_product_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | x: R[n, 30] @ DRAM 3 | for ij in seq(0, n * 30): 4 | x[ij / 30, ij % 30] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_product_loop2.txt: 
-------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n, 30] @ DRAM): 2 | for ij in seq(0, n * 30): 3 | x[ij / 30, ij % 30] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_product_loop4.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: R[n] @ DRAM): 2 | for ij in seq(0, n * 30): 3 | x[ij / 30] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_product_loop5.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: R[n, 100] @ DRAM): 2 | assert m < n 3 | x2 = x[0:m, 0:30] 4 | for ij in seq(0, m * 30): 5 | x2[ij / 30, ij % 30] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_rearrange_dim.txt: -------------------------------------------------------------------------------- 1 | def foo(N: size, M: size, K: size, x: i8[N, M, K] @ DRAM): 2 | a: i8[M, K, N] @ DRAM 3 | for n in seq(0, N): 4 | for m in seq(0, M): 5 | for k in seq(0, K): 6 | a[m, k, n] = x[n, m, k] 7 | def bar(N: size, M: size, K: size, x: i8[N, M, K] @ DRAM): 8 | a: i8[M, N, K] @ DRAM 9 | for n in seq(0, N): 10 | for m in seq(0, M): 11 | for k in seq(0, K): 12 | a[m, n, k] = x[n, m, k] 13 | a_1: i8[K, M, N] @ DRAM 14 | for n in seq(0, N): 15 | for m in seq(0, M): 16 | for k in seq(0, K): 17 | a_1[k, m, n] = x[n, m, k] 18 | def baz(N: size, M: size, x: i8[N, M] @ DRAM): 19 | a: i8[M, N] @ DRAM 20 | for n in seq(0, N): 21 | for m in seq(0, M): 22 | a[m, n] = x[n, m] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_rearrange_dim_2.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | a: i8[10, 10] @ DRAM 3 | for 
i in seq(0, 10): 4 | for j in seq(0, 10): 5 | a[j, i] = a[i, j] 6 | bar(stride(a, 0)) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_reassociate_then_fold.txt: -------------------------------------------------------------------------------- 1 | def foo(a: f32 @ DRAM, b: f32 @ DRAM, c: f32 @ DRAM): 2 | b += a + c -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_remove_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: i8 @ DRAM): 2 | a: i8 @ DRAM 3 | for j in seq(0, m): 4 | x = a 5 | def bar(n: size, m: size, x: i8 @ DRAM): 6 | a: i8 @ DRAM 7 | for j in seq(0, m): 8 | x = a 9 | for i in seq(0, n): 10 | for j in seq(0, m): 11 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_remove_loop_deterministic.txt: -------------------------------------------------------------------------------- 1 | def foo(M: size, N: size, K: size, A: f32[M, N] @ DRAM): 2 | if K / 4 > 0: 3 | for i in seq(0, M): 4 | for j in seq(0, N): 5 | A[i, j] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_reorder_stmts.txt: -------------------------------------------------------------------------------- 1 | def bar(g: R[100] @ DRAM): 2 | f: R[101] @ DRAM 3 | f[100] = 1.0 4 | for i in seq(0, 100): 5 | f[i] = 1.0 6 | for i in seq(0, 100): 7 | g[i] = f[i] + f[i + 1] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_replace_all_arch.txt: -------------------------------------------------------------------------------- 1 | def bar(src: f32[8] @ DRAM): 2 | dst: f32[8] @ AVX2 3 | mm256_loadu_ps(dst[0:8], src[0:8]) 4 | mm256_storeu_ps(src[0:8], dst[0:8]) 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_replace_all_length_mismatch.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8 @ DRAM): 2 | bar(x) 3 | x = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_replace_all_unambiguous.txt: -------------------------------------------------------------------------------- 1 | def bar(src: f32[8] @ DRAM): 2 | dst: f32[8] @ AVX2 3 | mm256_loadu_ps(dst[0:8], src[0:8]) 4 | mm256_storeu_ps(src[0:8], dst[0:8]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_replace_once.txt: -------------------------------------------------------------------------------- 1 | def bar(src: f32[8] @ DRAM): 2 | dst: f32[8] @ AVX2 3 | mm256_loadu_ps(dst[0:8], src[0:8]) 4 | for i in seq(0, 8): 5 | src[i] = dst[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_resize_dim.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: i8[19] @ DRAM 3 | for i in seq(1, 9): 4 | x[-1 + i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_resize_dim_2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | assert n > 4 3 | x: i8[-3 + n] @ DRAM 4 | for i in seq(2, -1 + n): 5 | x[-2 + i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_resize_dim_3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | x: i8[2] @ DRAM 3 | for i in seq(n + 1, n + 3): 4 | x[i - (n + 1)] = 1.0 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_resize_dim_4.txt: -------------------------------------------------------------------------------- 1 | def foo1(): 2 | x: i8[6] @ DRAM 3 | for i in seq(3, 6): 4 | bar(x[i - 2:i + 3 - 2]) 5 | def foo2(): 6 | x: i8[15, 10] @ DRAM 7 | for i in seq(3, 6): 8 | bar(x[i - 2, i:i + 3]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_resize_dim_5.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | x: i8[10] @ DRAM 3 | for i in seq(1, 8): 4 | x[i - -1] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_reuse_buffer.txt: -------------------------------------------------------------------------------- 1 | def foo(a: f32 @ DRAM, b: f32 @ DRAM): 2 | aa: f32 @ DRAM 3 | bb: f32 @ DRAM 4 | aa = a 5 | bb = b 6 | bb = aa + bb 7 | b = bb -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_reuse_buffer2.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | bb: f32 @ DRAM 3 | bar(bb) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_rewrite_expr.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | assert n % 4 == 2 3 | for i in seq(0, 6): 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_rewrite_expr_2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | for i in seq(0, n - n / 4 * 4): 3 | pass -------------------------------------------------------------------------------- 
/tests/golden/test_schedules/test_set_precision_api_type.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, x: f32[n] @ DRAM): 2 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_shift_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: f32[n] @ DRAM): 2 | for i in seq(1, 1 + n): 3 | x[-1 + i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_shift_loop_by_expr.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: f32[n + 1] @ DRAM): 2 | for i in seq(2 + n, 2 + 2 * n): 3 | x[-1 - n + i] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_shift_loop_nonzero_lo.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n + 1] @ DRAM): 2 | assert n >= m 3 | for i in seq(4, 4 - m + n): 4 | x[-4 + i + m] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_bind_expr.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, x: i8[n] @ DRAM, y: i8[n] @ DRAM, z: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | z_tmp: i8 @ DRAM 4 | z_tmp = x[i] + y[i] 5 | z[i] = z_tmp -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_divide_loop.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp: i8[n] @ DRAM 3 | for io in seq(0, (n + 3) / 4): 4 | for ii in seq(0, 4): 5 | if 4 * io + ii < n: 6 | tmp[4 * io + ii] = A[4 * io + ii] 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_fission.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM, B: i8[n] @ DRAM, C: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | C[i] += A[i] 4 | for i in seq(0, n): 5 | C[i] += B[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_inline.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, src: i8[n] @ DRAM, dst: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | tmp_src1: i8 @ DRAM 4 | tmp_src2: i8 @ DRAM 5 | tmp_src1 = src[i] 6 | tmp_src2 = src[i] 7 | tmp_dst: i8 @ DRAM 8 | tmp_dst = dst[i] 9 | tmp_dst = tmp_src1 + tmp_src2 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_lift_alloc.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | tmp_a: i8[n + 1] @ DRAM 3 | for i in seq(0, n): 4 | tmp_a[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_partial_eval.txt: -------------------------------------------------------------------------------- 1 | def bar(A: i8[10] @ DRAM): 2 | tmp: i8[10] @ DRAM 3 | for i in seq(0, 10): 4 | tmp[i] = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_reorder.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, m: size, A: i8[n, m] @ DRAM): 2 | tmp: i8[n, m] @ DRAM 3 | for j in seq(0, m): 4 | for i in seq(0, n): 5 | tmp[i, j] = A[i, j] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_reorder2.txt: 
-------------------------------------------------------------------------------- 1 | def bar(n: size, m: size, A: i8[n, m] @ DRAM): 2 | assert n > 5 3 | assert m > 7 4 | tmp: i8[n, m] @ DRAM 5 | for j in seq(2, m): 6 | for i in seq(4, n): 7 | tmp[i, j] = A[i, j] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_typ_and_mem.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: [i32][n] @ GEMM_SCRATCH): 2 | A[0] += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_typ_and_mem_2.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size): 2 | A: i32[n] @ GEMM_SCRATCH 3 | A[0] += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_unroll.txt: -------------------------------------------------------------------------------- 1 | def bar(A: i8[10] @ DRAM): 2 | tmp: i8[10] @ DRAM 3 | tmp[0] = A[0] 4 | tmp[1] = A[1] 5 | tmp[2] = A[2] 6 | tmp[3] = A[3] 7 | tmp[4] = A[4] 8 | tmp[5] = A[5] 9 | tmp[6] = A[6] 10 | tmp[7] = A[7] 11 | tmp[8] = A[8] 12 | tmp[9] = A[9] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simple_unroll2.txt: -------------------------------------------------------------------------------- 1 | def bar(A: i8[10] @ DRAM): 2 | tmp: i8[10] @ DRAM 3 | tmp[3] = A[3] 4 | tmp[4] = A[4] 5 | tmp[5] = A[5] 6 | tmp[6] = A[6] 7 | tmp[7] = A[7] 8 | tmp[8] = A[8] 9 | tmp[9] = A[9] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | x: R[n, 16, 10] @ DRAM 3 | for i in seq(0, 8 + 5 * n): 4 | pass 5 | 
y: R[10] @ DRAM 6 | y[1] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify2.txt: -------------------------------------------------------------------------------- 1 | def foo(A: i8[32, 64] @ DRAM, B: i8[16, 128] @ DRAM, C: i32[32, 32] @ DRAM, 2 | ko: size, ji_unroll: size, ii_unroll: size): 3 | for io in seq(0, 1): 4 | for jo in seq(0, 1): 5 | Btile1: i8[16, 64] @ DRAM 6 | Btile0: i8[16, 64] @ DRAM 7 | Atile0: i8[16, 64] @ DRAM 8 | Atile1: i8[16, 64] @ DRAM -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size): 2 | assert m == 1 and n == 1 3 | y: R[10] @ DRAM 4 | y[-(8 * n) + 10 * m] = 2.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify4.txt: -------------------------------------------------------------------------------- 1 | def bar(): 2 | for i in seq(0, 3): 3 | for j in seq(0, 16): 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_div_mod_staging.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[64] @ DRAM, y: R[64] @ DRAM, out: R[64] @ DRAM): 2 | xReg: R[16, 4] @ DRAM 3 | for io in seq(0, 16): 4 | for ii in seq(0, 4): 5 | xReg[io, ii] = x[ii + 4 * io] 6 | for io in seq(0, 16): 7 | for ii in seq(0, 4): 8 | out[ii + 4 * io] = xReg[io, ii] * y[ii + 4 * io] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[(3 + 2 * j + 8 * i) / 3] = 1.0 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div1.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[1 + j + 4 * i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div2.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[12 + j + 4 * i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div3.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 4): 6 | x[io] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div4.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 4): 6 | x[2 + io] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div5.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 4): 6 | x[io] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div6.txt: -------------------------------------------------------------------------------- 1 | def 
bar(N: size): 2 | for i in seq(0, N): 3 | for j in seq(0, 4): 4 | if i / 4 > 0: 5 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div_fail.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[1 + N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 4): 6 | x[(1 + ii + 4 * io) / 4] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div_fail1.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[1 + N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 5): 6 | x[(ii + 4 * io) / 4] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_div_fail2.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[2 * N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 5): 6 | x[(N + ii + 4 * io) / 2] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[(3 + 2 * j + 8 * i) % 5] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod1.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[(2 * j + 8 * i) % 3] = 1.0 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod2.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[1] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod3.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[(3 + 7 * i) % 2] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod4.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[3 + i] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_mod5.txt: -------------------------------------------------------------------------------- 1 | def bar(N: size, x: R[N] @ DRAM): 2 | assert N >= 1 3 | assert N % 4 == 0 4 | for io in seq(0, N / 4): 5 | for ii in seq(0, 4): 6 | x[ii] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_index_nested_div_mod.txt: -------------------------------------------------------------------------------- 1 | def bar(x: R[1000] @ DRAM): 2 | for i in seq(0, 4): 3 | for j in seq(0, 5): 4 | x[(3 + 2 * i) % 4] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_logical.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | if n > 0: 3 | pass 4 | pass 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_loop_bounds.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | for i in seq(7 + n, 17 + n): 3 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_nested_div.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | x: f32 @ DRAM 3 | for i in seq(0, n / 24): 4 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_nested_div_2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | x: f32 @ DRAM 3 | for ii in seq(0, n): 4 | for i in seq(0, (ii + n / 4 * 4) / 8): 5 | x = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_simplify_with_window_stmts.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size): 2 | x: i8[1] @ DRAM 3 | x_window = x[0:1] 4 | x_window[0] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_sink_alloc_simple_for_loop.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | for i in seq(0, 10): 3 | a: i8[10] @ DRAM 4 | pass -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_sink_alloc_simple_if_stmt.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | if 1 < 10: 3 | a: i8[10] @ DRAM 4 | a[1] = 0.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_sink_alloc_when_if_has_else.txt: 
-------------------------------------------------------------------------------- 1 | def foo(): 2 | if 1 < 10: 3 | a: i8[10] @ DRAM 4 | a[1] = 0.0 5 | else: 6 | a: i8[10] @ DRAM 7 | a_1[1] = 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_specialize.txt: -------------------------------------------------------------------------------- 1 | def foo(x: f32[4] @ DRAM): 2 | for i in seq(0, 4): 3 | if i == 0: 4 | x[i] += 1.0 5 | else: 6 | if i == 1: 7 | x[i] += 1.0 8 | else: 9 | if i == 2: 10 | x[i] += 1.0 11 | else: 12 | if i == 3: 13 | x[i] += 1.0 14 | else: 15 | x[i] += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_specialize_blocks.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, a: f32 @ DRAM): 2 | if n > 0: 3 | b: f32 @ DRAM 4 | a = 1.0 5 | a = 2.0 6 | b = 1.2 7 | else: 8 | b: f32 @ DRAM 9 | a = 1.0 10 | a = 2.0 11 | b = 1.2 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_specialize_sizes.txt: -------------------------------------------------------------------------------- 1 | def gemm(M: size, N: size, K: size, C: f32[M, N] @ DRAM, A: f32[M, K] @ DRAM, 2 | B: f32[K, N] @ DRAM, alpha: f32 @ DRAM): 3 | if N <= 64: 4 | for i in seq(0, M): 5 | for j in seq(0, N): 6 | for k in seq(0, K): 7 | C[i, j] += alpha * A[i, k] * B[k, j] 8 | else: 9 | if N <= 128: 10 | for i in seq(0, M): 11 | for j in seq(0, N): 12 | for k in seq(0, K): 13 | C[i, j] += alpha * A[i, k] * B[k, j] 14 | else: 15 | if N <= 512: 16 | for i in seq(0, M): 17 | for j in seq(0, N): 18 | for k in seq(0, K): 19 | C[i, j] += alpha * A[i, k] * B[k, j] 20 | else: 21 | for i in seq(0, M): 22 | for j in seq(0, N): 23 | for k in seq(0, K): 24 | C[i, j] += alpha * A[i, k] * B[k, j] 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_split_write.txt: -------------------------------------------------------------------------------- 1 | def bar(x: i8 @ DRAM): 2 | x = 1 3 | x += 2 4 | x += 3 5 | x += 4 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem.txt: -------------------------------------------------------------------------------- 1 | def sqmat(n: size, A: R[n, n] @ DRAM, B: R[n, n] @ DRAM): 2 | assert n % 4 == 0 3 | for i in seq(0, n / 4): 4 | for j in seq(0, n / 4): 5 | Atile: R[4, 4] @ DRAM 6 | for i0 in seq(0, 4): 7 | for i1 in seq(0, 4): 8 | Atile[i0, i1] = A[i0 + 4 * i, i1 + 4 * j] 9 | for k in seq(0, n / 4): 10 | for ii in seq(0, 4): 11 | for jj in seq(0, 4): 12 | for kk in seq(0, 4): 13 | Atile[ii, 14 | jj] += B[ii + 4 * i, kk + 15 | 4 * k] * B[kk + 4 * k, jj + 4 * j] 16 | for i0 in seq(0, 4): 17 | for i1 in seq(0, 4): 18 | A[i0 + 4 * i, i1 + 4 * j] = Atile[i0, i1] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_accum.txt: -------------------------------------------------------------------------------- 1 | def sqmat(n: size, A: R[n, n] @ DRAM, B: R[n, n] @ DRAM): 2 | assert n % 4 == 0 3 | for i in seq(0, n / 4): 4 | for j in seq(0, n / 4): 5 | Atile: R[4, 4] @ DRAM 6 | for i0 in seq(0, 4): 7 | for i1 in seq(0, 4): 8 | Atile[i0, i1] = 0.0 9 | for k in seq(0, n / 4): 10 | for ii in seq(0, 4): 11 | for jj in seq(0, 4): 12 | for kk in seq(0, 4): 13 | Atile[ii, 14 | jj] += B[ii + 4 * i, kk + 15 | 4 * k] * B[kk + 4 * k, jj + 4 * j] 16 | for i0 in seq(0, 4): 17 | for i1 in seq(0, 4): 18 | A[i0 + 4 * i, i1 + 4 * j] += Atile[i0, i1] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_accum2.txt: 
-------------------------------------------------------------------------------- 1 | def accum(out: R[4, 16, 16] @ DRAM, w: R[16] @ DRAM, im: R[16] @ DRAM): 2 | for k in seq(0, 4): 3 | o: R[16, 16] @ DRAM 4 | for i0 in seq(0, 16): 5 | for i1 in seq(0, 16): 6 | o[i0, i1] = out[k, i0, i1] 7 | for i in seq(0, 16): 8 | for j in seq(0, 16): 9 | o[i, j] += w[j] * im[i] 10 | for i0 in seq(0, 16): 11 | for i1 in seq(0, 16): 12 | out[k, i0, i1] = o[i0, i1] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_assign.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: [f32][n] @ DRAM, result: f32 @ DRAM): 2 | result = 0.0 3 | for i in seq(0, n): 4 | tile: f32 @ DRAM 5 | tile = x[i] 6 | result = tile + tile -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_assign2.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: [f32][n] @ DRAM, result: f32 @ DRAM): 2 | result = 0.0 3 | for i in seq(0, n): 4 | tile: f32 @ DRAM 5 | tile = x[i] 6 | result = tile 7 | result = tile -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_asum.txt: -------------------------------------------------------------------------------- 1 | def asum(n: size, x: [f32][n] @ DRAM, result: f32 @ DRAM): 2 | result = 0.0 3 | for i in seq(0, n): 4 | tile: f32 @ DRAM 5 | tile = x[i] 6 | result += select(0.0, tile, tile, -tile) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_okay.txt: -------------------------------------------------------------------------------- 1 | def foo(x: i8[10, 10, 10] @ DRAM): 2 | y: i8 @ DRAM 3 | x_tmp: i8[10, 10] @ DRAM 4 | for i0 in seq(0, 10): 5 | for i1 in seq(0, 10): 6 | x_tmp[i0, 
i1] = x[i0, 0, i1] 7 | for i in seq(0, 10): 8 | x_tmp[i, i] = 1.0 9 | y = x_tmp[2, 3] 10 | for i0 in seq(0, 10): 11 | for i1 in seq(0, 10): 12 | x[i0, 0, i1] = x_tmp[i0, i1] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bound_block.txt: -------------------------------------------------------------------------------- 1 | def axpy(n: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | for io in seq(0, (4 + n) / 5): 3 | xReg: f32[5] @ DRAM 4 | for i0 in seq(0, 5): 5 | if i0 + 5 * io < n: 6 | xReg[i0] = x[i0 + 5 * io] 7 | for ii in seq(0, 5): 8 | if ii + 5 * io < n: 9 | y[ii + 5 * io] += xReg[ii] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bound_point.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | assert m >= n 3 | for i in seq(0, m): 4 | tmp: f32 @ DRAM 5 | if i < n: 6 | tmp = x[i] 7 | if i < n: 8 | y[i] = tmp -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bound_reduction_accum.txt: -------------------------------------------------------------------------------- 1 | def axpy(n: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | for io in seq(0, (4 + n) / 5): 3 | yReg: f32[5] @ DRAM 4 | for i0 in seq(0, 5): 5 | yReg[i0] = 0.0 6 | for ii in seq(0, 5): 7 | if ii + 5 * io < n: 8 | yReg[ii] += x[ii + 5 * io] 9 | for i0 in seq(0, 5): 10 | if i0 + 5 * io < n: 11 | y[i0 + 5 * io] += yReg[i0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bounds_load_1D.txt: -------------------------------------------------------------------------------- 1 | def memcpy(n: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | for io in seq(0, (3 + n) / 4): 3 | 
yReg: f32[4] @ DRAM 4 | for i0 in seq(0, 4): 5 | if i0 + 4 * io < n: 6 | yReg[i0] = y[i0 + 4 * io] 7 | for ii in seq(0, 4): 8 | if ii + 4 * io < n: 9 | x[ii + 4 * io] = yReg[ii] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bounds_load_2D_one_cond.txt: -------------------------------------------------------------------------------- 1 | def memcpy_2D(m: size, n: size, x: f32[m, n] @ DRAM, y: f32[m, n] @ DRAM): 2 | for i in seq(0, m): 3 | for jo in seq(0, (3 + n) / 4): 4 | yReg: f32[4] @ DRAM 5 | for i0 in seq(0, 4): 6 | if i0 + 4 * jo < n: 7 | yReg[i0] = y[i, i0 + 4 * jo] 8 | for ji in seq(0, 4): 9 | if ji + 4 * jo < n: 10 | x[i, ji + 4 * jo] = yReg[ji] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bounds_load_2D_two_conds.txt: -------------------------------------------------------------------------------- 1 | def memcpy_2D(m: size, n: size, x: f32[m, n] @ DRAM, y: f32[m, n] @ DRAM): 2 | for io in seq(0, (6 + m) / 7): 3 | for jo in seq(0, (3 + n) / 4): 4 | yReg: f32[7, 4] @ DRAM 5 | for i0 in seq(0, 7): 6 | for i1 in seq(0, 4): 7 | if i0 + 7 * io < m and i1 + 4 * jo < n: 8 | yReg[i0, i1] = y[i0 + 7 * io, i1 + 4 * jo] 9 | for ii in seq(0, 7): 10 | if ii + 7 * io < m: 11 | for ji in seq(0, 4): 12 | if ji + 4 * jo < n: 13 | x[ii + 7 * io, ji + 4 * jo] = yReg[ii, ji] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bounds_reduction.txt: -------------------------------------------------------------------------------- 1 | def axpy(n: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | for io in seq(0, (4 + n) / 5): 3 | yReg: f32[5] @ DRAM 4 | for i0 in seq(0, 5): 5 | if i0 + 5 * io < n: 6 | yReg[i0] = y[i0 + 5 * io] 7 | for ii in seq(0, 5): 8 | if ii + 5 * io < n: 9 | yReg[ii] += x[ii + 5 * io] 10 | for i0 in seq(0, 5): 11 | 
if i0 + 5 * io < n: 12 | y[i0 + 5 * io] = yReg[i0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_out_of_bounds_store_1D.txt: -------------------------------------------------------------------------------- 1 | def memcpy(n: size, x: f32[n] @ DRAM, y: f32[n] @ DRAM): 2 | for io in seq(0, (3 + n) / 4): 3 | xReg: f32[4] @ DRAM 4 | for ii in seq(0, 4): 5 | if ii + 4 * io < n: 6 | xReg[ii] = y[ii + 4 * io] 7 | for i0 in seq(0, 4): 8 | if i0 + 4 * io < n: 9 | x[i0 + 4 * io] = xReg[i0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_point.txt: -------------------------------------------------------------------------------- 1 | def matmul(n: size, A: R[n, n] @ DRAM, B: R[n, n] @ DRAM, C: R[n, n] @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, n): 4 | res: R @ DRAM 5 | res = C[i, j] 6 | for k in seq(0, n): 7 | res += A[i, k] * B[k, j] 8 | C[i, j] = res -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_recursive.txt: -------------------------------------------------------------------------------- 1 | def recursive(n: size, y: R[n] @ DRAM, x: R[n] @ DRAM): 2 | assert n > 2 3 | assert (-2 + n) % 4 == 0 4 | for io in seq(0, (-2 + n) / 4): 5 | y_tmp: R[4] @ DRAM 6 | y_tmp[0] = y[1 + 4 * io] + y[4 * io] + x[4 * io] 7 | y_tmp[1] = y[1 + 4 * io] + y[4 * io] + x[4 * io] + y[1 + 4 * io] + x[ 8 | 1 + 4 * io] 9 | y_tmp[2] = y[1 + 4 * io] + y[4 * io] + x[4 * io] + y[1 + 4 * io] + x[ 10 | 1 + 4 * io] + (y[1 + 4 * io] + y[4 * io] + x[4 * io]) + x[2 + 11 | 4 * io] 12 | y_tmp[3] = y[1 + 4 * io] + y[4 * io] + x[4 * io] + y[1 + 4 * io] + x[ 13 | 1 + 4 * io] + (y[1 + 4 * io] + y[4 * io] + x[4 * io]) + x[ 14 | 2 + 4 * io] + (y[1 + 4 * io] + y[4 * io] + x[4 * io] + 15 | y[1 + 4 * io] + x[1 + 4 * io]) + x[3 + 4 * io] 16 | for i0 in seq(0, 4): 17 | y[2 + i0 + 4 * 
io] = y_tmp[i0] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_reduce.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, x: [f32][n] @ DRAM, result: f32 @ DRAM): 2 | result = 0.0 3 | for i in seq(0, n): 4 | tile: f32 @ DRAM 5 | tile = x[i] 6 | result += tile + tile -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_reduce2.txt: -------------------------------------------------------------------------------- 1 | def foo(x: f32[30] @ DRAM, result: f32 @ DRAM): 2 | tmp: f32 @ DRAM 3 | tmp = 0.0 4 | for i in seq(0, 30): 5 | x[i] = tmp 6 | result = tmp -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_stage_mem_twice.txt: -------------------------------------------------------------------------------- 1 | def sqmat(n: size, A: R[n, n] @ DRAM, B: R[n, n] @ DRAM): 2 | assert n % 4 == 0 3 | for i in seq(0, n / 4): 4 | for j in seq(0, n / 4): 5 | for k in seq(0, n / 4): 6 | B1: R[4, 4] @ DRAM 7 | for ii in seq(0, 4): 8 | for kk in seq(0, 4): 9 | B1[ii, kk] = B[ii + 4 * i, kk + 4 * k] 10 | B2: R[4, 4] @ DRAM 11 | for i0 in seq(0, 4): 12 | for i1 in seq(0, 4): 13 | B2[i0, i1] = B[i0 + 4 * k, i1 + 4 * j] 14 | for ii in seq(0, 4): 15 | for jj in seq(0, 4): 16 | for kk in seq(0, 4): 17 | A[ii + 4 * i, 18 | jj + 4 * j] += B1[ii, kk] * B2[kk, jj] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_transpose.txt: -------------------------------------------------------------------------------- 1 | def bar(m: size, n: size, A: i8[n, m] @ DRAM): 2 | for i in seq(0, m): 3 | for j in seq(0, n): 4 | A[j, i] += 1.0 -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify1.txt: 
-------------------------------------------------------------------------------- 1 | def foo(x: R[5, 5] @ DRAM, y: R[5, 5] @ DRAM): 2 | bar(5, y, x) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify10.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n] @ DRAM): 2 | assert -m + n >= 1 3 | assert -m + n <= 8 4 | y: f32[8] @ DRAM 5 | bar(y[0:8], x[0:8], 1 - m + n) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify11.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n] @ DRAM): 2 | assert -m + n >= 1 3 | assert -m + n <= 8 4 | y: f32[8] @ DRAM 5 | bar(y[0:8], x[0:8], -n + m) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify12.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n] @ DRAM): 2 | assert -m + n >= 1 3 | assert -m + n <= 8 4 | y: f32[8] @ DRAM 5 | bar(y[0:8], x[0:8], 1 - n + m) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify2.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[12, 12] @ DRAM, y: R[12, 12] @ DRAM): 2 | bar(5, y[5:10, 2:7], x[3:8, 1:6]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify3.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, z: R[n] @ DRAM, x: R[n] @ DRAM, y: R[n] @ DRAM): 2 | assert n % 4 == 0 3 | for i in seq(0, n / 4): 4 | simd_add4(z[4 * i + 0:4 * i + 4], x[4 * i + 0:4 * i + 4], 5 | y[4 * i + 0:4 * i + 4]) 
-------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify4.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[50, 2] @ DRAM, y: R[50, 2] @ DRAM): 2 | bar(50, x[0:50, 0], y[0:50, 1]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify5.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[5, 5] @ DRAM, y: R[5, 5] @ DRAM): 2 | bar(5, y, x) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify6.txt: -------------------------------------------------------------------------------- 1 | def bar(K: size, A: [i8][16, K] @ DRAM): 2 | for k in seq(0, K / 16): 3 | a: i8[16, 16] @ DRAM 4 | load(16, 16, A[0:16, 16 * k + 0:16 * k + 16], a[0:16, 0:16]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify7.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[5, 5] @ DRAM, y: R[5, 5] @ DRAM): 2 | bar(False, 5, y, x, 0) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify8.txt: -------------------------------------------------------------------------------- 1 | def foo(x: R[5, 5] @ DRAM, y: R[5, 5] @ DRAM): 2 | bar(5, 3, y, x) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unify9.txt: -------------------------------------------------------------------------------- 1 | def foo(n: size, m: size, x: f32[n] @ DRAM): 2 | assert -m + n >= 1 3 | assert -m + n <= 8 4 | y: f32[8] @ DRAM 5 | bar(y[0:8], x[0:8], -m + n) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unroll_buffer.txt: 
-------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | for i in seq(0, n): 3 | for j in seq(0, n): 4 | tmp_a_0: i8[2] @ DRAM 5 | tmp_a_1: i8[2] @ DRAM 6 | tmp_a_0[1] = A[i] 7 | tmp_a_0[1] = A[i] 8 | tmp_a_1[0] = A[i] 9 | def bar(n: size, A: i8[n] @ DRAM): 10 | for i in seq(0, n): 11 | for j in seq(0, n): 12 | tmp_a_0: i8[5] @ DRAM 13 | tmp_a_1: i8[5] @ DRAM 14 | tmp_a_1[0] = A[i] 15 | tmp_a_1[0] = A[i] 16 | tmp_a_0[1] = A[i] -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unroll_buffer1.txt: -------------------------------------------------------------------------------- 1 | def bar(n: size, A: i8[n] @ DRAM): 2 | assert n > 10 3 | for i in seq(0, n - 4): 4 | for j in seq(0, n): 5 | tmp_a_0: i8[4, 2] @ DRAM 6 | tmp_a_1: i8[4, 2] @ DRAM 7 | foo(tmp_a_0[0, 0:2], A[i:i + 2]) 8 | foo(tmp_a_1[0, 0:2], A[i + 2:i + 4]) -------------------------------------------------------------------------------- /tests/golden/test_schedules/test_unroll_buffer6.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | a_0: f32 @ DRAM 3 | a_1: f32 @ DRAM 4 | b_0: f32 @ DRAM 5 | b_1: f32 @ DRAM 6 | a_0 = b_0 7 | a_1 = b_1 -------------------------------------------------------------------------------- /tests/golden/test_sve_vla/test_compile_sve_vla_svmla.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include "exo_arm_sve.h" 4 | #include 5 | #include 6 | 7 | // svmla( 8 | // N : size, 9 | // C : f32[N] @DRAM, 10 | // A : f32[N] @DRAM, 11 | // B : f32 @DRAM 12 | // ) 13 | void svmla( void *ctxt, int_fast32_t N, float* C, const float* A, const float* B ) { 14 | svmla_n_f32_x_vla((N), &C[0], &A[0], *(B)); 15 | } 16 | 17 | 18 | /* relying on the following instruction..." 
19 | svmla_n_f32_x_vla(N,dst,src1,src2) 20 | svmla_n_f32_x_vla({N_data}, &{dst_data}, &{src1_data}, *{src2_data}); 21 | */ 22 | -------------------------------------------------------------------------------- /tests/golden/test_sve_vla/test_gen_sve_vla_svmla.txt: -------------------------------------------------------------------------------- 1 | def svmla(N: size, C: f32[N] @ DRAM, A: f32[N] @ DRAM, B: f32 @ DRAM): 2 | svmla_n_f32_x_vla(N, C[0:N], A[0:N], B) -------------------------------------------------------------------------------- /tests/golden/test_sve_vls/test_compile_sve_vls_svmla.txt: -------------------------------------------------------------------------------- 1 | #include "test.h" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | typedef svfloat32_t svfloat32_vls_t __attribute__((arm_sve_vector_bits(512))); 8 | typedef svfloat64_t svfloat64_vls_t __attribute__((arm_sve_vector_bits(512))); 9 | 10 | /* relying on the following instruction..." 11 | svld1_f32(dst,src) 12 | {dst_data} = svld1_f32(svptrue_b32(), &{src_data}); 13 | */ 14 | // svmla( 15 | // C : f32[16] @DRAM, 16 | // A : f32[16] @DRAM, 17 | // B : f32 @DRAM 18 | // ) 19 | void svmla( void *ctxt, float* C, const float* A, const float* B ) { 20 | svfloat32_vls_t C_reg; 21 | C_reg = svld1_f32(svptrue_b32(), &C[0]); 22 | svfloat32_vls_t A_reg; 23 | A_reg = svld1_f32(svptrue_b32(), &A[0]); 24 | C_reg = svmla_n_f32_x(svptrue_b32(), C_reg, A_reg, *(B)); 25 | svst1_f32(svptrue_b32(), &C[0], C_reg); 26 | } 27 | 28 | 29 | /* relying on the following instruction..." 30 | svmla_n_f32_x(dst,src1,src2) 31 | {dst_data} = svmla_n_f32_x(svptrue_b32(), {dst_data}, {src1_data}, *{src2_data}); 32 | */ 33 | 34 | /* relying on the following instruction..." 
35 | svst1_f32(dst,src) 36 | svst1_f32(svptrue_b32(), &{dst_data}, {src_data}); 37 | */ 38 | -------------------------------------------------------------------------------- /tests/golden/test_sve_vls/test_gen_sve_vls_svmla.txt: -------------------------------------------------------------------------------- 1 | def svmla(C: f32[16] @ DRAM, A: f32[16] @ DRAM, B: f32 @ DRAM): 2 | C_reg: f32[16] @ SVE_VLS 3 | svld1_f32(C_reg[0:16], C[0:16]) 4 | A_reg: f32[16] @ SVE_VLS 5 | svld1_f32(A_reg[0:16], A[0:16]) 6 | svmla_n_f32_x(C_reg[0:16], A_reg[0:16], B) 7 | svst1_f32(C[0:16], C_reg[0:16]) -------------------------------------------------------------------------------- /tests/golden/test_uast/test_alloc_nest.txt: -------------------------------------------------------------------------------- 1 | def alloc_nest(n: size, m: size, x: R[n, m], y: R[n, m] @ DRAM, 2 | res: R[n, m] @ DRAM): 3 | # @instr TEST 4 | for i in seq(0, n): 5 | rloc: R[m] @ DRAM 6 | xloc: R[m] @ DRAM 7 | yloc: R[m] @ DRAM 8 | for j in seq(0, m): 9 | xloc[j] = x[i, j] 10 | for j in seq(0, m): 11 | yloc[j] = y[i, j] 12 | for j in seq(0, m): 13 | rloc[j] = xloc[j] + yloc[j] 14 | for j in seq(0, m): 15 | res[i, j] = rloc[j] 16 | -------------------------------------------------------------------------------- /tests/golden/test_uast/test_conv1d.txt: -------------------------------------------------------------------------------- 1 | def conv1d(n: size, m: size, r: size, x: R[n], w: R[m], res: R[r]): 2 | # @instr TEST 3 | for i in seq(0, r): 4 | res[i] = 0.0 5 | for i in seq(0, r): 6 | for j in seq(0, n): 7 | if i <= j and j < i + m: 8 | res[i] += x[j] * w[i - j + m - 1] 9 | -------------------------------------------------------------------------------- /tests/golden/test_uast/test_unary_neg.txt: -------------------------------------------------------------------------------- 1 | def negate_array(n: size, x: R[n], res: R[n] @ DRAM): 2 | # @instr TEST 3 | for i in seq(0, n): 4 | res[i] = -x[i] + -x[i] - 
-(x[i] + 0.0) 5 | -------------------------------------------------------------------------------- /tests/golden/test_window/test_window_stmt.txt: -------------------------------------------------------------------------------- 1 | 2 | #include <stdint.h> 3 | #include <stdbool.h> 4 | 5 | // Compiler feature macros adapted from Hedley (public domain) 6 | // https://github.com/nemequ/hedley 7 | 8 | #if defined(__has_builtin) 9 | # define EXO_HAS_BUILTIN(builtin) __has_builtin(builtin) 10 | #else 11 | # define EXO_HAS_BUILTIN(builtin) (0) 12 | #endif 13 | 14 | #if EXO_HAS_BUILTIN(__builtin_assume) 15 | # define EXO_ASSUME(expr) __builtin_assume(expr) 16 | #elif EXO_HAS_BUILTIN(__builtin_unreachable) 17 | # define EXO_ASSUME(expr) \ 18 | ((void)((expr) ? 1 : (__builtin_unreachable(), 1))) 19 | #else 20 | # define EXO_ASSUME(expr) ((void)(expr)) 21 | #endif 22 | 23 | 24 | #ifndef EXO_WIN_1F32C 25 | #define EXO_WIN_1F32C 26 | struct exo_win_1f32c{ 27 | const float * const data; 28 | const int_fast32_t strides[1]; 29 | }; 30 | #endif 31 | // window_stmt( 32 | // n : size, 33 | // m : size, 34 | // x : f32[n, m] @DRAM 35 | // ) 36 | void window_stmt( void *ctxt, int_fast32_t n, int_fast32_t m, const float* x ); 37 | 38 | 39 | #include <stdio.h> 40 | #include <stdlib.h> 41 | 42 | // window_stmt( 43 | // n : size, 44 | // m : size, 45 | // x : f32[n, m] @DRAM 46 | // ) 47 | void window_stmt( void *ctxt, int_fast32_t n, int_fast32_t m, const float* x ) { 48 | struct exo_win_1f32c y = (struct exo_win_1f32c){ &x[0], { m } }; 49 | float *z = (float*) malloc(n * sizeof(*z)); 50 | for (int_fast32_t i = 0; i < n; i++) { 51 | z[i] = y.data[i * y.strides[0]]; 52 | } 53 | free(z); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /tests/golden/test_x86/test_avx2_divide_by_3.txt: -------------------------------------------------------------------------------- 1 | def foo(): 2 | out: ui16[16] @ AVX2 3 | x: ui16[16] @ AVX2 4 | avx2_ui16_divide_by_3(out[0:16], x[0:16])
-------------------------------------------------------------------------------- /tests/golden/test_x86/test_gen_avx2_simple_math_scheduling.txt: -------------------------------------------------------------------------------- 1 | def simple_math_avx2_sched(n: size, x: R[n] @ DRAM, y: R[n] @ DRAM): 2 | for io in seq(0, n / 8): 3 | xVec: R[8] @ AVX2 4 | mm256_loadu_ps(xVec[0:8], x[8 * io:8 + 8 * io]) 5 | yVec: R[8] @ AVX2 6 | mm256_loadu_ps(yVec[0:8], y[8 * io:8 + 8 * io]) 7 | xy: R[8] @ AVX2 8 | mm256_mul_ps(xy[0:8], xVec[0:8], yVec[0:8]) 9 | mm256_mul_ps(xVec[0:8], xy[0:8], yVec[0:8]) 10 | mm256_storeu_ps(x[8 * io:8 + 8 * io], xVec[0:8]) 11 | if n % 8 > 0: 12 | for ii in seq(0, n % 8): 13 | x[ii + n / 8 * 14 | 8] = x[ii + n / 8 * 8] * y[ii + n / 8 * 8] * y[ii + n / 8 * 8] -------------------------------------------------------------------------------- /tests/input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/exo-lang/exo/c2b48e4b210476dc7ec5030ef828a5545841e498/tests/input.png -------------------------------------------------------------------------------- /tests/test_error_reporting.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import textwrap 4 | 5 | import pytest 6 | 7 | from exo import SchedulingError 8 | from exo import proc 9 | from exo.frontend.syntax import * 10 | from exo.stdlib.scheduling import * 11 | 12 | # skipping because the API has changed to invalidate this particular 13 | # error message 14 | @pytest.mark.skip() 15 | def test_bad_reorder(): 16 | @proc 17 | def example(N: size, A: f32[N]): 18 | for i in seq(0, N): 19 | A[i] = 0.0 20 | 21 | expected_error = textwrap.dedent( 22 | """ 23 | reorder: failed to find statement 24 | Pattern: for i in _: 25 | for j in _: _ 26 | """ 27 | ).strip() 28 | 29 | with pytest.raises(SchedulingError, match=expected_error): 30 | reorder_loops(example, "i", 
"j") 31 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | import exo 8 | import exo.main 9 | 10 | REPO_ROOT = Path(__file__).parent.parent.resolve() 11 | 12 | 13 | def _test_app(module_file: Path): 14 | module_file = module_file.resolve(strict=True) 15 | mod = exo.main.load_user_code(module_file) 16 | procs = exo.main.get_procs_from_module(mod) 17 | 18 | c_file, h_file = exo.compile_procs_to_strings(procs, "test_case.h") 19 | 20 | return f"{h_file}\n{c_file}" 21 | 22 | 23 | # ---------------------------------------------------------------------------- # 24 | 25 | 26 | def test_avx2_matmul(golden): 27 | module_file = REPO_ROOT / "examples" / "avx2_matmul" / "x86_matmul.py" 28 | assert _test_app(module_file) == golden 29 | 30 | 31 | def test_cursors(golden): 32 | module_file = REPO_ROOT / "examples" / "cursors" / "cursors.py" 33 | assert _test_app(module_file) == golden 34 | 35 | 36 | def test_rvm_conv1d(golden): 37 | module_file = REPO_ROOT / "examples" / "rvm_conv1d" / "exo" / "conv1d.py" 38 | assert _test_app(module_file) == golden 39 | 40 | 41 | def test_quiz1(golden): 42 | module_file = REPO_ROOT / "examples" / "quiz1" / "quiz1.py" 43 | assert _test_app(module_file) == golden 44 | 45 | 46 | def test_quiz3(golden): 47 | module_file = REPO_ROOT / "examples" / "quiz3" / "quiz3.py" 48 | assert _test_app(module_file) == golden 49 | -------------------------------------------------------------------------------- /tests/test_parallel.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from exo import proc, Procedure, DRAM, compile_procs_to_strings 6 | from exo.stdlib.scheduling import * 7 | 8 | 9 | def 
test_pragma_parallel_loop(golden): 10 | @proc 11 | def foo(x: i8[10]): 12 | for i in par(0, 10): 13 | x[i] = 1.0 14 | 15 | c_file, _ = compile_procs_to_strings([foo], "test.h") 16 | 17 | assert c_file == golden 18 | 19 | 20 | def test_parallel_fail(): 21 | @proc 22 | def foo(A: i8[10]): 23 | total: i8 24 | for i in par(0, 10): 25 | total += A[i] 26 | 27 | with pytest.raises( 28 | TypeError, 29 | match=r"parallel loop\'s body is not parallelizable because of potential data races", 30 | ): 31 | c_file, _ = compile_procs_to_strings([foo], "test.h") 32 | 33 | 34 | def test_parallel_fail_2(): 35 | @proc 36 | def foo(A: i8[10]): 37 | total: i8 38 | for i in par(0, 10): 39 | total = A[i] 40 | 41 | with pytest.raises( 42 | TypeError, 43 | match=r"parallel loop\'s body is not parallelizable because of potential data races", 44 | ): 45 | c_file, _ = compile_procs_to_strings([foo], "test.h") 46 | -------------------------------------------------------------------------------- /tests/test_sve_vla.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from exo import proc, compile_procs_to_strings 6 | from exo.platforms.sve_vla import * 7 | from exo.stdlib.scheduling import * 8 | 9 | 10 | @pytest.fixture 11 | def test_sve_vla_svmla(): 12 | @proc 13 | def svmla( 14 | N: size, 15 | C: f32[N] @ DRAM, 16 | A: f32[N] @ DRAM, 17 | B: f32, 18 | ): 19 | for i in seq(0, N): 20 | C[i] += A[i] * B 21 | 22 | def simple_svmla(p=svmla): 23 | p = replace_all(p, svmla_n_f32_x_vla) 24 | return p 25 | 26 | simple_sve_vla_svmla = simple_svmla() 27 | 28 | return simplify(simple_sve_vla_svmla) 29 | 30 | 31 | def test_gen_sve_vla_svmla(golden, test_sve_vla_svmla): 32 | assert str(test_sve_vla_svmla) == golden 33 | 34 | 35 | def test_compile_sve_vla_svmla(golden, test_sve_vla_svmla): 36 | c_file, _ = compile_procs_to_strings([test_sve_vla_svmla], "test.h") 37 | 38 | assert c_file == golden 39 | 
-------------------------------------------------------------------------------- /tests/test_sve_vls.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import pytest 4 | 5 | from exo import proc, compile_procs_to_strings 6 | from exo.platforms.sve_vls import * 7 | from exo.stdlib.scheduling import * 8 | 9 | SVE_VLS = SVE_VLS(512) # rebinds the star-imported name to the object built for 512; everything below uses that object 10 | 11 | 12 | @pytest.fixture 13 | def test_sve_vls_svmla(): # a fixture despite the test_ prefix: builds the scheduled proc that the two tests below receive 14 | @proc 15 | def svmla( 16 | C: f32[16] @ DRAM, 17 | A: f32[16] @ DRAM, 18 | B: f32, 19 | ): 20 | for i in seq(0, 16): 21 | C[i] += A[i] * B 22 | 23 | def simple_svmla(p=svmla): # schedule applied step by step; each step rebinds p, so the order matters 24 | p = stage_mem(p, "for i in _:_", "C[0:16]", "C_reg") # stage C[0:16] as C_reg around the i loop 25 | p = set_memory(p, "C_reg:_", SVE_VLS.Vector) # give C_reg the SVE_VLS.Vector memory 26 | p = stage_mem(p, "for i in _:_", "A[0:16]", "A_reg") # same staging for A 27 | p = set_memory(p, "A_reg:_", SVE_VLS.Vector) 28 | p = replace_all(p, SVE_VLS.svld1_f32) # then substitute the three instructions, one replace_all each 29 | p = replace_all(p, SVE_VLS.svst1_f32) 30 | p = replace_all(p, SVE_VLS.svmla_n_f32_x) 31 | return p 32 | 33 | simple_sve_vls_svmla = simple_svmla() 34 | 35 | return simplify(simple_sve_vls_svmla) 36 | 37 | 38 | def test_gen_sve_vls_svmla(golden, test_sve_vls_svmla): # compares the printed proc with the golden text 39 | assert str(test_sve_vls_svmla) == golden 40 | 41 | 42 | def test_compile_sve_vls_svmla(golden, test_sve_vls_svmla): # compares the generated C source with the golden text 43 | c_file, _ = compile_procs_to_strings([test_sve_vls_svmla], "test.h") 44 | 45 | assert c_file == golden 46 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py310 3 | 4 | [testenv] 5 | deps = 6 | # This list is maintained separately from requirements.txt to make it 7 | # harder to accidentally forget a package dependency in setup.cfg.
8 | Pillow 9 | pytest 10 | pytest-cov 11 | numpy 12 | commands = pytest --cov=./ --cov=exo --cov-report=xml -o pythonpath= 13 | passenv = 14 | SDE_PATH 15 | --------------------------------------------------------------------------------