├── Creating_AFI.md
├── Optimization_lab.md
├── README.md
├── debug_lab.md
├── images
    ├── Fig-binary_container.png
    ├── Fig-build.png
    ├── Fig-hw_button.png
    ├── Fig-refresh.png
    ├── Fig-run.png
    ├── SDX_IDE.png
    ├── connecting_lab
    │   ├── FigConnectingLab-1.png
    │   ├── FigConnectingLab-10.png
    │   ├── FigConnectingLab-11.png
    │   ├── FigConnectingLab-12.png
    │   ├── FigConnectingLab-13-1.png
    │   ├── FigConnectingLab-13-2.png
    │   ├── FigConnectingLab-14.png
    │   ├── FigConnectingLab-15.png
    │   ├── FigConnectingLab-16.png
    │   ├── FigConnectingLab-17.png
    │   ├── FigConnectingLab-2.png
    │   ├── FigConnectingLab-3.png
    │   ├── FigConnectingLab-4-1.png
    │   ├── FigConnectingLab-4-2.png
    │   ├── FigConnectingLab-5.png
    │   ├── FigConnectingLab-6.png
    │   ├── FigConnectingLab-7.png
    │   ├── FigConnectingLab-8.png
    │   ├── FigConnectingLab-9.png
    │   └── nimbix
    │   │   ├── connect_to_instance.png
    │   │   ├── linux_desktop.png
    │   │   ├── select_desktop_mode.png
    │   │   ├── select_instance.png
    │   │   └── select_instance_config.png
    ├── debug_lab
    │   ├── FigDebugLab-10.png
    │   ├── FigDebugLab-11.png
    │   ├── FigDebugLab-12.png
    │   ├── FigDebugLab-13.png
    │   ├── FigDebugLab-14.png
    │   ├── FigDebugLab-15.png
    │   ├── FigDebugLab-16.png
    │   ├── FigDebugLab-17.png
    │   ├── FigDebugLab-18.png
    │   ├── FigDebugLab-2.png
    │   ├── FigDebugLab-3.png
    │   ├── FigDebugLab-4.png
    │   ├── FigDebugLab-5.png
    │   ├── FigDebugLab-6.png
    │   ├── FigDebugLab-7.png
    │   ├── FigDebugLab-8.png
    │   ├── FigDebugLab-9.png
    │   ├── add_virtual_cable.png
    │   ├── enable_chipscope.png
    │   ├── hw_manager_open_target.png
    │   ├── localhost_connected.png
    │   ├── rtl_kernel_exe_properties.png
    │   ├── run_trigger_immediate.png
    │   ├── set_virtual_cable_port.png
    │   ├── trigger_button.png
    │   └── turn_off_autobuild.png
    ├── f1_platform.png
    ├── helloworld
    │   ├── FigGUIflowLab-10.png
    │   ├── FigGUIflowLab-11.png
    │   ├── FigGUIflowLab-13.png
    │   ├── FigGUIflowLab-14.png
    │   ├── FigGUIflowLab-15-1.png
    │   ├── FigGUIflowLab-15-2.png
    │   ├── FigGUIflowLab-15-3.png
    │   ├── FigGUIflowLab-16.png
    │   ├── FigGUIflowLab-18.png
    │   ├── FigGUIflowLab-19.png
    │   ├── FigGUIflowLab-20.png
    │   ├── FigGUIflowLab-21-1.png
    │   ├── FigGUIflowLab-21.png
    │   ├── FigGUIflowLab-22.png
    │   ├── FigGUIflowLab-23.png
    │   ├── FigGUIflowLab-24.png
    │   ├── FigGUIflowLab-25.png
    │   ├── FigGUIflowLab-26.png
    │   ├── FigGUIflowLab-27.png
    │   ├── FigGUIflowLab-28.png
    │   ├── FigGUIflowLab-29.png
    │   ├── FigGUIflowLab-30.png
    │   ├── FigGUIflowLab-31.png
    │   ├── FigGUIflowLab-6.png
    │   ├── FigGUIflowLab-8.png
    │   ├── FigGUIflowLab-9.png
    │   ├── add_xclbin_argument.png
    │   ├── empty_application_project.png
    │   ├── file_permissions.png
    │   ├── import_from_dir.png
    │   ├── import_srcs.png
    │   ├── sdx_hello_world_ide.png
    │   ├── select_srcs.png
    │   ├── select_u200_platform.png
    │   ├── select_vector_add_fn.png
    │   └── sys_estimate.png
    ├── makefile_lab
    │   ├── FigMakefileLab-1.png
    │   ├── FigMakefileLab-2.png
    │   ├── FigMakefileLab-3.png
    │   ├── FigMakefileLab-4.png
    │   ├── FigMakefileLab-5.png
    │   ├── FigMakefileLab-6.png
    │   └── linker_flag.png
    ├── nice_dcv.png
    ├── nice_dcv_desktop.png
    ├── optimization_lab
    │   ├── FigOptimizationLab-11.png
    │   ├── FigOptimizationLab-12.png
    │   ├── FigOptimizationLab-13.png
    │   ├── FigOptimizationLab-14.png
    │   ├── FigOptimizationLab-15-1.png
    │   ├── FigOptimizationLab-15.png
    │   ├── FigOptimizationLab-16.png
    │   ├── FigOptimizationLab-17.png
    │   ├── FigOptimizationLab-18.png
    │   ├── FigOptimizationLab-19.png
    │   ├── FigOptimizationLab-20-1.png
    │   ├── FigOptimizationLab-20.png
    │   ├── FigOptimizationLab-21.png
    │   ├── FigOptimizationLab-22-1.png
    │   ├── FigOptimizationLab-22.png
    │   ├── FigOptimizationLab-23.png
    │   ├── FigOptimizationLab-5.png
    │   ├── FigOptimizationLab-6.png
    │   ├── FigOptimizationLab-7.png
    │   ├── FigOptimizationLab-8.png
    │   ├── application_timeline_after_host_optimiaztion.png
    │   ├── application_timeline_before_host_optimiaztion.png
    │   ├── compute_unit_settings.png
    │   ├── localhost_connected.png
    │   └── zoon_buttons.png
    ├── putty_dcv.png
    ├── putty_ip4.png
    └── rtlkernel_lab
    │   ├── FigRTLKernelLab-10.png
    │   ├── FigRTLKernelLab-11.png
    │   ├── FigRTLKernelLab-12.png
    │   ├── FigRTLKernelLab-13.png
    │   ├── FigRTLKernelLab-14.png
    │   ├── FigRTLKernelLab-15.png
    │   ├── FigRTLKernelLab-16.png
    │   ├── FigRTLKernelLab-17.png
    │   ├── FigRTLKernelLab-18.png
    │   ├── FigRTLKernelLab-19.png
    │   ├── FigRTLKernelLab-20.png
    │   ├── FigRTLKernelLab-21.png
    │   ├── FigRTLKernelLab-22.png
    │   ├── FigRTLKernelLab-4.png
    │   ├── FigRTLKernelLab-5.png
    │   ├── FigRTLKernelLab-6.png
    │   ├── FigRTLKernelLab-7.png
    │   ├── FigRTLKernelLab-8.png
    │   ├── FigRTLKernelLab-9.png
    │   ├── hardware_emulation_application_timeline.png
    │   └── hw_emulation_completed_successfully.png
├── rtl_kernel_wizard_lab.md
├── sdx_introduction.md
├── setup_aws.md
├── setup_local_computer.md
├── setup_nimbix.md
├── setup_sdx.md
├── setup_xup_aws_workshop.md
├── slides
    ├── 01_Course_Intro.pdf
    ├── 02_Intro_to_AWS_EC2_F1.pdf
    ├── 03_SDAccel_Tool_Overview.pdf
    ├── 04_SDAccel_Flows.pdf
    ├── 05_Optimization_Techniques.pdf
    ├── 06_RTL_Kernel_Wizard.pdf
    └── 07_Debugging.pdf
├── solutions
    ├── hello_world
    │   ├── aws
    │   │   └── awsf1_2xlarge_18_3_hello_world_sol.sdx.zip
    │   └── u200
    │   │   ├── u200_nimbix_ubuntu16_04_hello_world_sol.sdx.zip
    │   │   └── u200_rh7_5_hello_world_sol_.sdx.zip
    └── optimization_lab
    │   ├── aws
    │       └── aws_2xlarge_18_3_optimization_lab.sdx.zip
    │   └── u200
    │       ├── u200_nimbix_ubuntu16_04_optimization_lab_sol.sdx.zip
    │       └── u200_rh7_5_optimization_lab_sol.sdx.zip
└── sources
    ├── debug
        ├── aws
        │   └── awsf1_2xlarge_18_3_debug.sdx.zip
        └── u200
        │   ├── u200_nimbix_ubuntu16_04_rtl_kernel.sdx.zip
        │   └── u200_rh7_5_debug.sdx.zip
    ├── helloworld_ocl
        ├── command_line.ipynb
        └── src
        │   ├── krnl_vadd.cl
        │   ├── vadd.cpp
        │   ├── vadd.h
        │   ├── xcl.cpp
        │   └── xcl.h
    └── optimization_lab
        ├── idct.cpp
        └── krnl_idct.cpp


/Creating_AFI.md:
--------------------------------------------------------------------------------
 1 | <table style="width:100%">
 2 |   <tr>
 3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
 4 |   </tr>
 5 |   <tr>
 6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
 7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
 8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
 9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
12 |   </tr>
13 | </table>
14 | 
15 | # Creating an Amazon FPGA Image (AFI)
16 | 
17 | This document guides you through the steps to create an Amazon FPGA Image (AFI), which can be run on an AWS EC2 F1 instance to verify that the design works in hardware. It assumes that a full system (SDx project) has been built, consisting of a *host* application (.exe) and an FPGA binary file (.xclbin). 
18 | 
19 | ### Create an AFI
20 | 
21 | To execute the application on F1, the following files are needed:
22 | 
23 | - Host application (exe)
24 | - Amazon FPGA Image (awsxclbin)
25 | 
26 | The awsxclbin is an Amazon specific version of the FPGA binary file (xclbin) produced by the SDx software.
27 | 
28 | The awsxclbin can be created by running the *create\_sdaccel\_afi.sh* script which is included in the aws-fpga Git repository. 
29 | 
30 | The script can be found in the following location in the aws-fpga repository:
31 | 
32 | ```
33 | ./aws-fpga/SDAccel/tools/create_sdaccel_afi.sh
34 | ```
35 | 
36 | Before running the commands below, make sure the SDx setup script has been sourced (the following command assumes the aws-fpga Git repository is cloned to the user home area)
37 | 
38 | ```
39 | source ~/aws-fpga/sdaccel_setup.sh
40 | ```
41 | 
42 | * Create an AFI by running the create\_sdaccel\_afi.sh script and wait for the completion of the AFI creation process
43 | 
44 | ```
45 | $SDACCEL_DIR/tools/create_sdaccel_afi.sh -xclbin=<filename>.xclbin -s3_bucket=<bucket-name> -s3_dcp_key=<dcp-folder-name> -s3_logs_key=<logs-folder-name>
46 | ```
47 | In the above command, set the *xclbin* &lt;filename&gt;, and set the Amazon S3 &lt;bucket-name&gt;, &lt;dcp-folder-name&gt;, and &lt;logs-folder-name&gt; to the names you chose when running the CLI script.  You can choose any valid folder name for the dcp and logs folders. The Amazon S3 bucket name should match an S3 bucket you have set up. 
48 | 
49 | Learn more about setting up S3 buckets at [https://github.com/aws/aws-fpga/blob/master/SDAccel/docs/Setup_AWS_CLI_and_S3_Bucket.md](https://github.com/aws/aws-fpga/blob/master/SDAccel/docs/Setup_AWS_CLI_and_S3_Bucket.md)  
50 | 
51 | The create\_sdaccel\_afi.sh script does the following:
52 | 
53 | - Starts a background process to create the AFI
54 | - Generates a \_afi\_id.txt which contains the FPGA Image Identifier (or AFI ID) and Global FPGA Image Identifier (or AGFI ID) of the generated AFIs
55 | - Creates the \*.awsxclbin AWS FPGA binary file which is passed to the host application to determine which AFI should be loaded to the FPGA.
56 | - Uploads the xclbin to the AWS cloud for processing.
57 | ## Check the AFI status
58 | 
59 | The AFI will become available after some time in the AWS cloud and can then be used to program the FPGA in an AWS EC2 F1 instance. To check the AFI status, the AFI ID is required. 
60 | 
61 | - In the directory where the *create_sdaccel_afi.sh* script was run, enter the following command to find the AFI ID  
62 |    
63 |    ```
64 |    cat *afi_id.txt
65 |    ```
66 | * Enter the **describe-fpga-images** API command to check the status of the AFI generation process:
67 | 
68 | ```
69 |    aws ec2 describe-fpga-images --fpga-image-ids <AFI ID>
70 | ```
71 | * For example, 
72 | 
73 | ```
74 | aws ec2 describe-fpga-images --fpga-image-ids afi-0b9167434a1c74ba9
75 | ```
76 | 
77 | Note: When AFI creation is in progress, the *State* will be pending. When the AFI creation is finished, the output should show *available*:
78 | 
79 | ```
80 |    ...
81 |    "State": {
82 |        "Code": "available"
83 |    },
84 |    
85 |    ...
86 | ```
87 | 
88 | Wait until the AFI becomes available before proceeding to execute on the F1 instance.
89 | 
90 | 


--------------------------------------------------------------------------------
/Optimization_lab.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # Optimization Lab
 16 | 
 17 | ## Introduction
 18 | 
 19 | In this lab you will create an SDx project and analyze the design to optimize the host code and kernel code to improve the performance of the design.
 20 | 
 21 | ## Objectives
 22 | 
 23 | After completing this lab, you will be able to:
 24 | 
 25 | - Analyze the design and read project reports
 26 | - Optimize the kernel code to improve throughput
 27 | - Optimize the host code to improve the data transfer rate
 28 | - Verify the functionality of the design in hardware
 29 | 
 30 | ### Create an SDx Project
 31 | 
 32 | * Start SDx and select the default workspace (or continue with the workspace from the previous lab)
 33 | 
 34 | * Click on the **Create Application Project** 
 35 | 
 36 | * In the _New Project_&#39;s page enter **optimization\_lab** in the _Project name:_ field and click **Next**
 37 | 
 38 | * Select your target platform and click **Next**
 39 | 
 40 | * Select **Empty Application** and click **Finish**
 41 | 
 42 | ### Import the source files into the project
 43 | 
 44 | * In the _Project Explorer_ expand the *optimization_lab* folder if necessary, and right-click on the **src** folder and select **Import…**
 45 | 
 46 | * Select **General &gt; File System**, click **Next**, browse to the source directory at **~/compute_acceleration/sources/optimization\_lab** and click **OK**
 47 | 
 48 | * Select the **idct.cpp** and **krnl\_idct.cpp** files and click **Finish**
 49 | 
 50 | * Expand the **src** folder in the _Project Explorer_ and note the two added files
 51 | 
 52 | ### Add a function as a hardware kernel
 53 | 
 54 | * Click on the _Add Hardware Function_ button icon (![alt tag](./images/Fig-hw_button.png)) in the **Hardware Functions** tab to see functions available for implementation in hardware.
 55 | 
 56 | * Select _krnl\_idct_ function and click **OK**
 57 | 
 58 |     ![](./images/optimization_lab/FigOptimizationLab-6.png)
 59 | 
 60 | * Notice that a **binary\_container\_1** folder is created automatically, under which the _krnl\_idct_ function is added
 61 | 
 62 | ### Analyze the source files
 63 | 
 64 | * From the *Project Explorer* open the **src>krnl\_idct.cpp** file
 65 | 
 66 | * The **Outline** panel should be visible. It displays an outline of the code of the source file that is currently in scope. If you can't see it, go to **Window > Show View > Outline**. 
 67 | 
 68 |     The outline view can be used to navigate the source file. For example, function names are displayed in the outline view, and clicking on a function will jump to the line of code where the function is defined. 
 69 | 
 70 |     ![](./images/optimization_lab/FigOptimizationLab-7.png)
 71 | 
 72 | * In the _Outline_ viewer, click **idct** to look up the function
 73 | 
 74 | The `idct()` function is the core algorithm in the kernel. It is a computationally intensive function that can be highly parallelized on the FPGA, providing significant acceleration over a CPU-based implementation.
 75 | 
 76 | * Review the code
 77 | 
 78 |     - **krnl\_idct** : Top-level function for the hardware kernel. Interface properties for the kernel are specified in this function
 79 |     - **krnl\_idct\_dataflow** : Called by the **krnl\_idct** function and encapsulates the main functions of the kernel
 80 |     - **read\_blocks** : Reads the data sent by the host application from global memory and streams it to the *execute* function
 81 |     - **execute** : For each 8x8 block received, calls the **idct** function to perform the actual IDCT computation 
 82 |     - **write\_blocks** : Receives results from the **execute** function and writes them back to global memory for the host application  
 83 |     
 84 | * Open the **idct.cpp** file.  Again, use the _Outline_ viewer to quickly look up and inspect the important functions of the host application:  
 85 |     - **main** : Initializes the test vectors, sets-up OpenCL resources, runs the reference model, runs the hardware kernel, releases the OpenCL resources, and compares the results of the reference IDCT model with the hardware implementation
 86 |     - **runFPGA** : Takes in a vector of inputs and for each 8x8 block calls the hardware accelerated IDCT using the **write** , **run** , **read** , and **finish** helper functions. These functions use OpenCL API calls to communicate with the FPGA
 87 |     - **runCPU** : Takes in a vector of inputs and, for each 8x8 block, calls **idctSoft** , a reference implementation of the IDCT
 88 |     - **idctSoft** : Software implementation of the IDCT algorithm, used to check the results from the FPGA
 89 |     - **oclDct** : This class is used to encapsulate the OpenCL runtime calls to interact with the kernel in the FPGA
 90 |     - **aligned\_allocator** , **smalloc** , **load\_file\_to\_memory** : These are small helper functions used during test vector generation and OpenCL setup  
 91 | * Look at the code around line number 496 of the **idct.cpp** file by pressing Ctrl+l (small L) and entering 496. 
 92 |   This section of code is where the OpenCL environment is set up in the host application. It is typical of most SDx applications and will look very familiar to developers with prior OpenCL experience. This body of code can often be reused as-is from project to project.  
 93 | 
 94 | To set up the OpenCL environment, the following API calls are made: 
 95 | 
 96 |    - **clGetPlatformIDs** : Queries the system to identify any available OpenCL platforms. It is called twice as it first checks the number of available platforms before extracting the list of platforms
 97 |    - **clGetPlatformInfo** : Get information about the OpenCL platform, such as vendor name and platform name
 98 |    - **clGetDeviceIDs** : Obtain a list of devices available on a platform
 99 |    - **clCreateContext** : Creates an OpenCL context, which manages the runtime objects
100 |    - **clGetDeviceInfo** : Get information about an OpenCL device like the device name
101 |    - **clCreateProgramWithBinary** : Creates a program object for a context, and loads specified binary data into the program object. The actual program is obtained before this call through the `load_file_to_memory()` function
102 |    - **clCreateKernel** : Creates a kernel object
103 |    - **clCreateCommandQueue** : Create a command-queue on a specific device
104 | 
105 | Note: all objects obtained through a **clCreate...** function call should be released before terminating the program by calling the corresponding **clRelease...** function. This avoids memory leaks and clears the locks on the device.
106 | 
107 | ### Configure the System Port options
108 | 
109 | #### Configure the System Port in the SDx GUI
110 | 
111 | In the *idct.cpp* file, locate lines 286-297. Note that two memory buffers, *mInBuffer* and *mOutBuffer* are being used. The memory buffers will be located in external DRAM. The kernel will have one or more ports connected to the memory bank(s). By default, the compiler will connect all ports to BANK0 or DDR[0]. For Alveo, memory interfaces can be configured from the SDx GUI, or via a "System Port" switch (--sp) that is passed to the XOCC Kernel Linker. For AWS only the "System Port" switches are currently supported. 
112 | 
113 | * For Alveo, in the *Assistant* view, right-click on *Emulation-SW* and click **Settings**
114 | * In the *Hardware Function Settings* expand *optimization_lab > Emulation-SW > binary_container_1* and select **krnl_idct**
115 | 
116 | * Under *Compute Unit Settings* expand *krnl_idct* and krnl_idct_1
117 | 
118 | * From the dropdown block under *Memory* select the following:
119 | 
120 |    * block: DDR[0] 
121 |    * q: DDR[0]
122 |    * voutp: DDR[1]
123 | 
124 | ![](./images/optimization_lab/compute_unit_settings.png)
125 | 
126 | Click **Apply and Close**
127 | 
128 | #### Configure the System Port command line switch
129 | 
130 | For AWS, you can set memory interfaces by passing the --sp compiler switch to the XOCC kernel linker: 
131 | 
132 | ```
133 |     --sp <kernel_instance_name>.<interface_name>:<bank name> 
134 | ```
135 | 
136 | Where:
137 | 
138 | * **&lt;kernel_instance_name&gt;** is the instance name of the kernel 
139 | * **&lt;interface_name&gt;** is the name of the memory interface 
140 | 
141 | The interface names can be found in the **Emulation-SW (or Emulation-HW) > binary_container_1 > reports > link >binary_container_1.xclbin.info** log file
142 | 
143 | * In this case the linker switches would be 
144 | 
145 | ```console
146 |    --sp krnl_idct_1.m_axi_gmem0:bank0 
147 |    --sp krnl_idct_1.m_axi_gmem1:bank0 
148 |    --sp krnl_idct_1.m_axi_gmem2:bank1
149 | ```
150 | 
151 | * In the Project Explorer pane, right-click the project **optimization\_lab** and select the **C/C++ Settings**
152 | 
153 | * Select **C/C++ Build** > **Settings** in the left pane
154 | * Select the **Miscellaneous** under **SDx XOCC Kernel Linker**
155 | 
156 | ![](./images/optimization_lab/FigOptimizationLab-8.png)
157 | 
158 | The switches can be added individually, or all on one line. 
159 | 
160 | ### Build and run software emulation (Emulation-SW)
161 | 
162 | * Make sure the **project.sdx** under _optimization\_lab_ in the **Project Explorer** tab is selected
163 | * Select **Emulation-SW** as the *Active Build Configuration*
164 | * Build the project by clicking the Build (![alt tag](./images/Fig-build.png)) button  
165 | * In the Project Explorer pane, right-click the project **optimization\_lab** and select **Run As** &gt; **Run Configurations…**
166 | * Select the **Arguments** tab
167 | * Click on the **Automatically add binary container(s) to arguments** check box  
168 | This will add **../binary\_container\_1.xclbin**
169 | * Click **Apply** and then click **Run**  
170 |      The application will be run and the output will be displayed in the Console tab
171 | 
172 | ```console
173 | [Console output redirected to file:/home/cmccabe/workspace/optimization_lab/Emulation-SW/optimization_lab-Default.launch.log]
174 | FPGA number of 64*int16_t blocks per transfer: 256
175 | DEVICE: xilinx_u200_xdma_201830_1
176 | Loading Bitstream: ../binary_container_1.xclbin
177 | INFO: Loaded file
178 | Create Kernel: krnl_idct
179 | Create Compute Unit
180 | Setup complete
181 | Running CPU version
182 | Running FPGA version
183 | Runs complete validating results
184 | TEST PASSED
185 | RUN COMPLETE
186 | ```
187 | 
188 | ### Review the software emulation reports
189 | * In the **Assistant** tab, expand **optimization\_lab** &gt; **Emulation-SW** &gt; **optimization\_lab-Default** 
190 | 
191 |   There will be two files generated by the tool after running the software emulation: Profile Summary and Application Timeline
192 | 
193 | ![](./images/optimization_lab/FigOptimizationLab-11.png)
194 | 
195 | * Double-click the **Profile Summary** report and review it  
196 | 
197 | ![](./images/optimization_lab/FigOptimizationLab-12.png)
198 | 
199 | * Click on each of the tabs and review the report:  
200 |     - **Top Operations** : Shows all the major top operations of memory transfer between the host and kernel to global memory, and kernel execution. This allows you to identify throughput bottlenecks when transferring data. Efficient transfer of data to the kernel/host allows for faster execution times
201 |     - **Kernels &amp; Compute Units** : Shows the number of times the kernel was executed. Includes the total, minimum, average, and maximum run times. If the design has multiple compute units, it will show each compute unit&#39;s utilization. When accelerating an algorithm, the faster the kernel executes, the higher the throughput which can be achieved. It is best to optimize the kernel to be as fast as it can be with the data it requires
202 |     - **Data Transfers** : This tab has no bearing in software emulation as no actual data transfers are emulated across the host to the platform. In hardware emulation, this shows the throughput and bandwidth of the read/writes to the global memory that the host and kernel share
203 |     - **OpenCL APIs** : Shows all the OpenCL API command executions, how many times each was executed, and how long they take to execute
204 | 
205 | * Double-click the **Application Timeline** report and review it
206 | 
207 |     ![](./images/optimization_lab/FigOptimizationLab-13.png)
208 | 
209 | The **Application Timeline** collects and displays host and device events on a common timeline to help you understand and visualize the overall health and performance of your systems. These events include OpenCL API calls from the host code: when they happen and how long each of them takes.
210 | 
211 | ### Perform HW Emulation      
212 | * Click on the drop-down button of _Active build configuration_ and select **Emulation-HW**
213 | 
214 | * Assign the System Ports as you did in the Emulation-SW mode
215 | 
216 |    * In the *Assistant view* > right click on *Emulation-HW* and click **Settings**
217 | 
218 |    * Expand *optimization_lab > Emulation-HW > binary_container_1* and select **krnl_idct**
219 |    * Under *Compute Unit Settings* expand *krnl_idct* and krnl_idct_1
220 |    * Select the following:
221 | 
222 |        - block and q: DDR[0] 
223 |        - voutp: DDR[1]
224 |    
225 | * Build the project (![alt tag](./images/Fig-build.png)) 
226 | 
227 | * Select **Run &gt; Run Configurations…** to open the configurations window
228 | 
229 | * In the *Main* tab, click to select **Use waveform for kernel debugging** and **Launch live waveform** 
230 | 
231 |     ![](./images/optimization_lab/FigOptimizationLab-14.png)
232 | 
233 | * Click on the **Arguments** tab and make sure the _binary\_container\_1.xclbin_ is already assigned  
234 | 
235 |   If no argument was assigned click to select **Automatically add binary container(s) to arguments**, and click **Apply**
236 | 
237 | * Click **Run** to run the application  
238 | 
239 | The Console tab shows that the test was completed successfully along with the data transfer rate  
240 | 
241 | ```
242 | [Console output redirected to file:/home/nimbix/workspace/optimization_lab/Emulation-HW/optimization_lab-Default.launch.log]
243 | FPGA number of 64*int16_t blocks per transfer: 256
244 | DEVICE: xilinx_u200_xdma_201830_1
245 | Loading Bitstream: ../binary_container_1.xclbin
246 | INFO: Loaded file
247 | INFO: [SDx-EM 01] Hardware emulation runs simulation underneath. Using a large data set will result in long simulation times. It is recommended that a small dataset is used for faster execution. This flow does not use cycle accurate models and hence the performance data generated is approximate.
248 | Create Kernel: krnl_idct
249 | Create Compute Unit
250 | Setup complete
251 | Running CPU version
252 | Running FPGA version
253 | Runs complete validating results
254 | TEST PASSED
255 | RUN COMPLETE
256 | INFO: [SDx-EM 22] [Wall clock time: 14:53, Emulation time: 0.075225 ms] Data transfer between kernel(s) and global memory(s)
257 | krnl_idct_1:m_axi_gmem-DDR[0]          RD = 128.000 KB             WR = 0.000 KB        
258 | krnl_idct_1:m_axi_gmem1-DDR[0]          RD = 0.500 KB               WR = 0.000 KB        
259 | krnl_idct_1:m_axi_gmem2-DDR[1]          RD = 0.000 KB               WR = 128.000 KB
260 | 
261 | INFO: [SDx-EM 06-0] Waiting for the simulator process to exit
262 | ```
263 | 
264 | Notice that Vivado was started and the simulation waveform window is updated.
265 | 
266 | * Click on the Zoom full button and scroll down the waveform window to see activities taking place in the kernel  
267 | 
268 | Notice that the execution is sequential
269 | 
270 | ![](./images/optimization_lab/FigOptimizationLab-15-1.png)
271 | 
272 | You can close Vivado when you are ready. We will not examine the transactions in detail. 
273 | 
274 | ### Understand the HLS Report, profile summary, and Application Timeline
275 | 
276 | * In the **Assistant** tab, expand **optimization\_lab** &gt; **Emulation-HW** &gt; **optimization\_lab-Default**
277 | 
278 | * Double-click the **Profile Summary** report and review it
279 | 
280 | ![](./images/optimization_lab/FigOptimizationLab-16.png)
281 | 
282 | * Click on the **Kernels &amp; Compute Units** tab of the Profile Summary report
283 | 
284 | * Review the Kernel **Total Time (ms)**  
285 | 
286 |   This number will serve as a baseline (reference point) to compare against after optimization.
287 | 
288 | ![](./images/optimization_lab/FigOptimizationLab-17.png)
289 | 
290 | * In the **Assistant** tab, expand **optimization\_lab** &gt; **Emulation-HW** &gt; **binary\_container\_1** &gt; **krnl\_idct**
291 | 
292 | * Double-click the **HLS Report** and review it
293 | 
294 |     ![](./images/optimization_lab/FigOptimizationLab-18.png)
295 | 
296 | * In the **Performance Estimates** section, expand the **Latency (clock cycles)** &gt; **Summary** and note the following numbers:  
297 | 
298 |     - Latency (min/max): ~6000
299 |     - Interval (min/max): ~6000
300 | 
301 | The numbers may vary slightly depending on the target hardware you selected. 
302 | The numbers will serve as a baseline for comparison against optimized versions of the kernel
303 | 
304 | * In the HLS report, expand **Latency (clock cycles)** &gt; **Detail** &gt; **Instance**
305 | 
306 |     - Note that the 3 sub-functions read, execute and write have roughly the same latency and that their sum total is equivalent to the total Interval reported in the Summary table
307 |     - This indicates that the three sub-functions are executing sequentially, hinting to an optimization opportunity
308 | 
309 |     * Close all the reports
310 | 
311 | ### Analyze the kernel code and apply the DATAFLOW directive
312 | 
313 | * Open the **src > krnl\_idct.cpp** file
314 | 
315 | * Using the **Outline** viewer, navigate to the **krnl\_idct\_dataflow** function  
316 |   Observe that the three functions are communicating using **hls::streams** objects. These objects model a FIFO-based communication scheme. This is the recommended coding style which should be used whenever possible to exhibit streaming behavior and allow **DATAFLOW** optimization
317 | 
318 | * Enable the DATAFLOW optimization by uncommenting the **#pragma HLS DATAFLOW** present in the krnl\_idct\_dataflow function (line 319).
319 | 
320 | The DATAFLOW optimization allows each of the subsequent functions to execute as independent processes. This results in overlapping and pipelined execution of the read, execute and write functions instead of sequential execution. The FIFO channels between the different processes do not need to buffer the complete dataset anymore but can directly stream the data to the next block.
321 | 
322 | * Comment out the three **#pragma HLS stream** statements on lines 324, 325 and 326
323 | 
324 | * Save the file
325 | 
326 | ### Build the project in Hardware emulation configuration and analyze the HLS report
327 | 
328 | * Make sure the active configuration is **Emulation-HW**
329 | * Click on the Build button (![alt tag](./images/Fig-build.png)) to build the project
330 | * In the **Assistant** tab, expand **optimization\_lab** &gt; **Emulation-HW** &gt; **binary\_container\_1**  &gt;   **krnl\_idct**
331 | * Double-click the **HLS Report** and review it
332 | 
333 | ![](./images/optimization_lab/FigOptimizationLab-19.png)
334 | 
335 | * In the **Performance Estimates** section, expand the **Latency (clock cycles)** &gt; **Summary** and note the following numbers:
336 |     - Latency (min/max): ~2000
337 |     - Interval (min/max): ~2000
338 | 
339 | ###  Run the Hardware Emulation
340 | 
341 | * Run the application by clicking the Run button (![alt tag](./images/Fig-run.png))  and wait for the run to finish with RUN COMPLETE message 
342 |   Notice the effect of the dataflow optimization in the Vivado simulation waveform view. Execution of the read, execute, and write stages of the kernel is now pipelined and concurrent.
343 | 
344 | ![](./images/optimization_lab/FigOptimizationLab-20-1.png)
345 | 
346 | * In the **Assistant** tab, expand **optimization\_lab > Emulation-HW > optimization\_lab-Default**  and double-click the **Profile Summary** report
347 | 
348 | * Select the **Kernels &amp; Compute Units** tab.  
349 |   Compare the **Kernel Total Time (ms)** with the results from the un-optimized run (numbers may vary slightly to the results displayed below)
350 | 
351 | ![](./images/optimization_lab/FigOptimizationLab-20.png)
352 | 
353 | ### Analyze the host code
354 | 
355 | * Open the **src > idct.cpp** file
356 | 
357 | * Using the **Outline** viewer, navigate to the **runFPGA** function  
358 | 
359 |   For each block of 8x8 values, the **runFPGA** function writes data to the FPGA, runs the kernel, and reads results back. Communication with the FPGA is handled by the OpenCL API calls made within the `cu.write()`, `cu.run()` and `cu.read()` function calls
360 | 
361 |     - `clEnqueueMigrateMemObjects()` schedules the transfer of data to or from the FPGA
362 |     - `clEnqueueTask()` schedules the executing of the kernel 
363 |   These OpenCL functions use events to signal their completion and synchronize execution  
364 | 
365 | * Open the **Application Timeline** of the _Emulation-HW_ run 
366 |   The green segments at the bottom indicate when the IDCT kernel is running
367 |     ![](./images/optimization_lab/application_timeline_before_host_optimiaztion.png)
368 | 
369 | * Notice that there are gaps between each of the green segments indicating that the operations are not overlapping
370 | 
371 | * Zoom in by performing a left mouse drag across one of these gaps to get a more detailed view  
372 |     - The two green segments correspond to two consecutive invocations of the IDCT kernel
373 |     - The gap between the two segments is indicative of the kernel idle time between these two invocations
374 |     - The **Data Transfer** section of the timeline shows that **Read** and **Write** operations are happening when the kernel is idle
375 |     - The Read operation is to retrieve the results from the execution which just finished and the Write operation is to send inputs for the next execution
376 |     - This represents a sequential execution flow of each iteration
377 | 
378 | * Close the **Application Timeline**  
379 | 
380 | * In the **idct.cpp** file, go to the `oclDct::write()` function (line ~260)
381 |     - Notice on line ~274, the function synchronizes on the **outEvVec** event through a call to `clWaitForEvents()`
382 | 
383 |     ```
384 |         clWaitForEvents(1, &outEvVec[mCount]);
385 |     ```
386 | 
387 |     - This event is generated by the completion of the `clEnqueueMigrateMemObjects()` call in the `oclDct::read()` function (line ~360)
388 |     - Effectively the next execution of the `oclDct::write()` function is gated by the completion of the previous `oclDct::read()` function, resulting in the sequential behavior observed in the **Application Timeline**
389 | 
390 | * Use the **Outline** viewer to locate the definition of the **NUM\_SCHED** macro in the **idct.cpp** file
391 |     - This macro defines the depth of the event queue
392 |     - The value of 1 explains the observed behavior: new tasks (write, run, read) are only enqueued when the previous one has completed, effectively synchronizing each loop iteration
393 |     - By increasing the value of the **NUM\_SCHED** macro, we increase the depth of the event queue and enable more blocks to be enqueued for processing, which may allow the write, run and read tasks to overlap and the kernel to execute continuously, or at least more frequently
394 |     - This technique is called software pipelining
395 | 
396 | * Modify line 152 to increase the value of **NUM\_SCHED** to 6 as follows  
397 | 
398 | ```
399 |     #define NUM_SCHED 6
400 | ```
401 | 
402 | * Save the file
403 | 
404 | ### Run Hardware Emulation
405 | 
406 | * Change the run configuration by unchecking the **Use waveform for kernel debugging** option, click **Apply**, and then click **Close**
407 | 
408 | * Run the application by clicking the Run button (![](./images/Fig-run.png))  
409 |     - Since only the idct.cpp file was changed, the incremental makefile rebuilds only the host code before running emulation
410 |     - This should be much faster than also recompiling the kernel to hardware 
411 | * In the **Assistant** tab, expand **optimization\_lab > Emulation-HW > optimization\_lab-Default**
412 | * Double-click the **Application Timeline** report  
413 | 
414 | Observe how **software pipelining** enables overlapping of data transfers and kernel execution.
415 | ​
416 | ![](./images/optimization_lab/application_timeline_after_host_optimiaztion.png)
417 | 
418 | Note: system tasks might slow down communication between the application and the hardware simulation, impacting on the performance results. The effect of software pipelining is considerably higher when running on the actual hardware.
419 | 
420 | ###  Run the Application in hardware
421 | 
422 | As before, building the FPGA hardware takes some time, and a precompiled solution is provided. 
423 | 
424 | For AWS, execute the following in a new terminal, as this needs to be run as sudo
425 | 
426 | ```
427 | sudo sh
428 | source /opt/Xilinx/SDx/2018.3.op2405991/settings64.sh
429 | source /opt/xilinx/xrt/setup.sh
430 | export PLATFORM_REPO_PATHS=/home/centos/src/project_data/aws-fpga/SDAccel/aws_platform/xilinx_aws-vu9p-f1-04261818_dynamic_5_0
431 | ```
432 | 
433 | - Start SDx (execute ```sdx``` from the terminal) and from the SDx file menu, select **import**
434 | - Expand *Xilinx* and select **SDx Project** and click **Next**
435 | - Choose *SDx project exported zip* file and click **Next**
436 | 
437 | - Browse to **~/compute_acceleration/solutions/optimization_lab/[aws|u200]/** and select the corresponding **\*.sdx.zip** file for your OS, and click **OK**
438 | 
439 | You should see a new *optimization_lab* folder in the Project Explorer
440 | 
441 | #### Set the executable file permissions
442 | 
443 | Zip files do not preserve Unix file permissions, so the executable permissions must be modified manually.
444 | 
445 | - Expand *optimization_lab > System* and right click on **optimization_lab.exe**
446 | 
447 | - Select **Execute** for the *Owner* permissions and click **Apply and Close**
448 | 
449 | ![](./images/helloworld/file_permissions.png)
450 | 
451 | #### Disable Auto building
452 | 
453 | - Right click on the project folder, select C/C++ Build Settings
454 | - In C/C++ Build, in the **Behavior** tab make sure the following are unchecked: *Build on resource save (Auto Build)*, *Build (incremental build)* and *Clean* 
455 | - Click **Apply and Close**
456 | 
457 | #### Run the application
458 | 
459 | - Open the project.sdx and select **System** as the *Active build configuration*
460 | 
461 | - In the SDx *Run* menu, select **Run Configurations**
462 | 
463 | - Expand OpenCL if necessary, and select the **optimization_lab-Default** configuration
464 | 
465 | - Check the *binary container* has been included in the *Arguments* tab 
466 | 
467 |   For Alveo, this will be the binary_container_1.**xclbin** file, and for AWS this will be the binary_container_1.**awsxclbin** file. 
468 | 
469 | * On the profiling tab, disable profiling as data transfer in this example will slow down the application (you can rerun with profiling enabled later if you wish)
470 | 
471 | * Click **Run**
472 | 
473 |   
474 | 
475 | The FPGA bitstream will be downloaded and the host application will be executed. 
476 | 
477 | 
478 | ## Conclusion
479 | 
480 | In this lab, you used SDx to create a project and add a kernel (hardware) function. You performed software and hardware emulation, analyzed the design and the various reports generated by the tools. You then optimized the kernel code using the DATAFLOW pragma, and host code by increasing the number of read, write, and run tasks to improve throughput and data transfer rates. You then validated the functionality in hardware.
481 | 
482 | ---------------------------------------
483 | 
484 | 
485 | Start the next lab: [4. RTL-Kernel Wizard Lab](rtl_kernel_wizard_lab.md)
486 | 
487 | ---------------------------------------
488 | 
489 | 
490 | ## Appendix Build Full Hardware 
491 | 
492 | **Set the build configuration to System and build the system (Note: building the project takes about two hours, so skip this step in the workshop environment).**  
493 | 
494 | * Either select **Project &gt; Build Configurations &gt; Set Active &gt; System** or click on the drop-down button of _Active build configuration_ and select **System**  
495 | * Set the XOCC Kernel Linker flag as  before
496 | * Either select **Project &gt; Build Project** or click on the (![alt tag](./images/Fig-build.png)) button    
497 | This will build the project under the **System** directory. The built project will include **optimization\_lab.exe** file along with **binary\_container\_1.xclbin** file. This step takes about two hours  
498 | 
499 | ### AWS-F1
500 | 
501 | Once the full system is built, you can <a href="Creating_AFI.md">create an AWS F1 AFI</a>
502 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <table style="width:100%">
 2 |   <tr>
 3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
 4 |   </tr>
 5 |   <tr>
 6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
 7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
 8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
 9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
12 |   </tr>
13 | </table>
14 | 
15 | ### Introduction
16 | 
17 | Welcome to the XUP SDx tutorial. These labs will provide hands-on experience using the SDx software tools with Xilinx FPGA hardware. 
18 | 
19 | The SDx tools can be run on your local machine, or in the cloud (e.g. AWS or Nimbix). 
20 | 
21 | The tutorial instructions target the following hardware and software:
22 | 
23 | * SDx 2018.3
24 | * AWS EC2 F1 f1.2xlarge (cloud)
25 | * Alveo U200 (local board, or cloud)
26 | 
27 | You can build designs locally or in the cloud, and test on hardware locally (if you have an Alveo board) or in the cloud (AWS EC2 F1, Nimbix Alveo U200).
28 | 
29 | You will learn how to develop applications using the SDx development environment that supports OpenCL/C/C++ and RTL kernels.
30 | 
31 | #### Overview of the SDx flow
32 | 
33 | ![alt tag](./images/f1_platform.png)
34 | 
35 | 
36 | 1. SDx is the development environment used to create host applications and hardware accelerators. It includes host CPU and FPGA compilers, and profiling and debugging tools.
37 | 2. The host application can be written in C or C++ and uses the OpenCL API or the XRT (Xilinx Runtime Library) to interact with the accelerated hardware functions running in the FPGA. The accelerated hardware functions (also referred to as kernels) can be written in C, C++, OpenCL or RTL.
38 | 
39 | 
40 | #### Tutorial overview
41 | 
42 |  It is recommended to complete each lab before proceeding to the next.
43 | 
44 | * [**Setup SDx**](setup_sdx.md)
45 | This will show you how to set up SDx on your own machine, or how to use it in the cloud (instructions are provided for AWS and Nimbix clouds).
46 | 
47 | * [**Introduction to SDx**](sdx_introduction.md)
48 | This lab guides you through the steps involved in using a GUI flow to create an SDx project. After creating a project, you will run CPU and hardware emulation to verify the functionality. You will then use an AWS F1 instance to validate the design on F1.
49 | 
50 | * [**Optimization**](Optimization_lab.md)
51 | This lab guides you through the steps involved in creating a project and adding a kernel function. After creating a project, you will run CPU and hardware emulation to verify the functionality, analyze various generated reports and then apply techniques both on host and kernel side to improve throughput and data transfer rate.
52 | 
53 | * [**RTL-Kernel Wizard**](rtl_kernel_wizard_lab.md)
54 | This lab guides you through the steps involved in using the RTL Kernel Wizard to wrap a user RTL-based IP so that the generated IP can be used in an SDx project.
55 | 
56 | * [**Debug**](debug_lab.md)
57 | This lab will show you how to carry out host application debug, and debug of the hardware kernel.
58 | 
59 | * [**SDx command line**](command_line.md)
60 | This lab guides you through the steps involved in using the SDx command line (using a Makefile) to build and perform CPU and hardware emulation of your design. You will then test the design using FPGA hardware. 
61 | 
62 | ---------------------------------------
63 | 
64 | 
65 | When you have set up your instance, go to the first lab [Introduction to SDx](sdx_introduction.md)
66 | 
67 | 


--------------------------------------------------------------------------------
/debug_lab.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # Hardware/Software Debugging
 16 | 
 17 | ## Introduction
 18 | 
 19 | This lab is a continuation of the previous (**<a href="rtl_kernel_wizard_lab.md">RTL-Kernel Wizard Lab</a>**) lab. You will use ChipScope to monitor signals at the kernel interface level and perform software debugging using SDx. Note that this lab is not currently supported on Nimbix as the Xilinx Virtual Cable (XVC) is not supported there.
 20 | 
 21 | ## Objectives
 22 | 
 23 | After completing this lab, you will be able to:
 24 | 
 25 | * Add ChipScope cores to an SDx design
 26 | * Use ChipScope to monitor signals at the kernel interface
 27 | * Debug a software application in SDx
 28 | 
 29 | ## Steps
 30 | 
 31 | ### Open SDx and import the project
 32 | 
 33 | To save time on compilation, a precompiled project will be provided with the Chipscope debug cores already included in the design. 
 34 | 
 35 | * Open SDx
 36 |     Make sure your target platform has already been imported. You will see an error if the platform used by the precompiled project is not available.
 37 | * From the SDx *File* menu, select **Import**
 38 | * In the *Import Wizard*, expand *Xilinx* and select **SDX Project** and click **Next**
 39 | * Select **SDx project exported zip file** and click **Next**
 40 | * Browse to the appropriate *folder* for your target and click **OK**. You should see a .sdx.zip file in the folder you select. 
 41 |       ~/compute_acceleration/sources/debug/<aws|u200>
 42 | * Select the appropriate project archive for your target and click **Next**
 43 | 
 44 | #### Set permissions on imported executable
 45 | 
 46 | * Once the project has been imported, in *Project Explorer* expand **debug>System**
 47 | * Right click on *debug.exe* and select **Properties**
 48 | * Tick the box to add **Execute** to the *Owner* permissions
 49 | 
 50 | ![](./images/debug_lab/rtl_kernel_exe_properties.png)
 51 | 
 52 | 
 53 | 
 54 | * If you don't see an option to set the permissions, open a terminal, browse to the directory containing the debug.exe, and run the following command to change the permissions to make the file executable:
 55 | 
 56 | ```
 57 | chmod 777 debug.exe 
 58 | ```
 59 | 
 60 | ### Hardware Debugging
 61 | 
 62 | #### Review Appendix-I section to understand how to add the ChipScope Debug bridge core and build the project. The debug core has been included in the precompiled sources provided
 63 | 
 64 | #### Run the application
 65 | 
 66 | * Open the project.sdx and select **System** as the Active build configuration.
 67 | 
 68 | * From the **Run** menu, select **Run Configurations**
 69 | * Expand *OpenCL* and select *debug-Default*
 70 | * For Alveo, in the *Arguments* tab make sure **Automatically add binary container(s) to arguments** is selected. For AWS, make sure the *.awsxclbin filename is listed as an argument
 71 | * Click **Run**
 72 | 
 73 | The host application will start executing, load the bitstream, and wait for user input (press any key to continue) 
 74 | 
 75 | ![](./images/debug_lab/FigDebugLab-3.png)
 76 | 
 77 | ### Set up the Xilinx Virtual Cable (XVC)
 78 | 
 79 | The Xilinx Virtual Cable (XVC) is a virtual device that gives you JTAG debug capabilities over PCIe to the target device. XVC will be used to debug the design. 
 80 | 
 81 | #### For Alveo U200
 82 | 
 83 | For an Alveo board, you need to determine the XVC device in your system. XVC is installed as part of the SDx and XRT installation. 
 84 | 
 85 | ```
 86 |       ls /dev/xvc_pub*
 87 | ```
 88 | 
 89 | This will report something similar to the output below:
 90 | 
 91 | ```
 92 |       /dev/xvc_pub.u513
 93 | ```
 94 | 
 95 | Each computer may have a different value for *xvc_pub.\** so you will need to check the name for your computer. 
 96 | 
 97 | * In a terminal window, start a virtual jtag connection 
 98 | 
 99 | Run the following command (where _u513_ should be the value you obtained from the previous command):
100 | 
101 | ```
102 |       sdx_debug_hw --xvc_pcie /dev/xvc_pub.u513 --hw_server
103 | ```
104 | 
105 | ```
106 |       launching xvc_pcie...
107 |       xvc_pcie -d /dev/xvc_pub.u513 -s TCP::10200
108 |       launching hw_server...
109 |       hw_server -sTCP::3121
110 | 
111 |       ****************************
112 |       *** Press Ctrl-C to exit ***
113 |       ****************************
114 | ```
115 | 
116 | The Virtual JTAG XVC Server will start listening to TCP port **10200** in this case. This is the port you will need to connect to from Vivado (below). Note the *hw_server* is listening to TCP port 3121.
117 | 
118 | Skip the next section and continue with [Connecting Vivado to the XVC](#connect_vivado_to_xvc)
119 | 
120 | #### For AWS
121 | For AWS run the following script which will manage setup of the XVC:
122 | 
123 |    ```
124 |       sudo fpga-start-virtual-jtag -P 10200 -S 0
125 |    ```
126 | 
127 | ![](./images/debug_lab/FigDebugLab-4.png)
128 | 
129 | <a name="connect_vivado_to_xvc"></a>
130 | 
131 | ### Connecting Vivado to the XVC
132 | 
133 | * Start Vivado from another terminal
134 | 
135 |    ```
136 |       vivado
137 |    ```
138 | 
139 | * Click on **Open Hardware Manager** link
140 | * Click **Open Target > Autoconnect**
141 | 
142 | ![](./images/debug_lab/hw_manager_open_target.png)
143 | 
144 | * Right click on *localhost (0)* and select **Add Xilinx Virtual Cable (XVC)** 
145 | 
146 |    ![](./images/debug_lab/add_virtual_cable.png)
147 | 
148 | * Enter **localhost** as the *host name*, and **10200** as the port (or the *port number* for your machine obtained previously) and click **OK**
149 | 
150 |    ![](./images/debug_lab/set_virtual_cable_port.png)
151 | 
152 | * Right click on the *debug_bridge* and select **Refresh Device**.
153 | 
154 | The Vivado Hardware Manager should open showing _Hardware_, _Waveform_, _Settings-hw_, _Trigger-Setup_ windows. The _Hardware_ window also shows the detected ILA cores (*hw_ila_\**), inserted in the design. The Alveo design will have one ILA. The AWS design will have two ILAs, one monitoring the AWS shell interface. 
155 | 
156 | ![](./images/debug_lab/FigDebugLab-7.png)
157 | 
158 | 
159 | * Select the *debug_bridge* in the Hardware panel
160 | * In the _Hardware Device Properties_ view, click on the browse button beside **Probes file**
161 | * Browse to the project's **./workspace/debug/System** folder, select the **.ltx** file and click **OK**  
162 | * Select the *hw_ila_1* tab, and notice four (Slot_0 to Slot_3) probes are filled in the Waveform window
163 | * Click on the **Run Trigger immediate** button ![](./images/debug_lab/run_trigger_immediate.png) and observe that the waveform window fills with data showing that the four channels were _Inactive_ for the duration of the signal capture. 
164 | 
165 |    ![](./images/debug_lab/FigDebugLab-8.png)
166 | 
167 | * Expand **slot_1 : KVAdd_1_m01_axi : Interface** , then find and expand  **slot_1 : KVAdd_1_m01_axi : W Channel** in the Waveform window.
168 | * Select the **WVALID** signal and drag it to the Trigger Setup - hw window
169 | 
170 |    ![](./images/debug_lab/FigDebugLab-9.png)
171 | 
172 | * Click on drop-down button of the Value field and select trigger condition value as 1
173 | 
174 |    ![](./images/debug_lab/FigDebugLab-10.png)
175 | 
176 | * Click on the _Run trigger_ button ![](./images/debug_lab/trigger_button.png) and observe the _hw_ila_1_ probe is waiting for the trigger condition to occur
177 | 
178 |    ![](./images/debug_lab/FigDebugLab-11.png)
179 | 
180 | * Switch to the SDx window, select the *Console* window and press the **Enter key** to allow the program to continue executing
181 | Observe that the program completes displaying **INFO: Test completed successfully** in the Console window
182 | * Switch back to Vivado and notice that because the trigger condition was met, the waveform window has been populated with new captured data. 
183 | 
184 |    ![](./images/debug_lab/FigDebugLab-12.png)
185 | 
186 | * Expand **Slot_0, slot_1,** and **slot_2** groups, zoom in to the region around samples _450 to 1000_, and observe the data transfers taking place on each channel. Also note the addresses from where data are read and where the results are written to.
187 | 
188 |    ![](./images/debug_lab/FigDebugLab-13.png)
189 | 
190 | * Zoom in on one of the transactions and hover your mouse at each successive sample and notice the data content changing
191 | * When you are finished, close Vivado by selecting **File > Exit**
192 | * Close the jtag probe by switching to its terminal window and pressing _Ctrl-C_
193 | 
194 | ### Perform Software Debugging
195 | 
196 | * Switch to the SDx GUI
197 | 
198 | * From the **Run** menu, select **Debug Configurations**
199 | 
200 | * Make sure that the **Arguments** tab shows **../binary_container_1.xclbin** 
201 | 
202 | * Click **Debug**  
203 | 
204 | * Click **Yes** when prompted to switch to the _Debug perspective_
205 | The bitstream will be downloaded to the FPGA and the host application will start executing, halting at **main()** entry point
206 | 
207 | * In _host_example.cpp_ view scroll down to line ~262 and double-click on the left border to set a breakpoint. At this point, three buffers would have been created
208 | 
209 |   ![](./images/debug_lab/FigDebugLab-14.png)
210 |   
211 | * Click on the **Resume** button or press **F8** 
212 | 
213 | * When prompted click in the console and press *Enter* 
214 |   The program will resume executing and stop when it reaches the breakpoint  
215 |   At this point you can click on the various monitoring tabs (*Variables, Command Queue, Memory Buffers* etc.) and see the contents currently in scope
216 |   SDx debug allows command queues and memory buffers to be examined as the program execution progresses
217 | 
218 | * Click back to select *Debug.exe > #Thread 1* in the Debug panel
219 | 
220 | * Click on the **Step Over** button or press **F6**
221 |   
222 | 
223 | The execution will progress one statement at a time
224 | 
225 | * Continue pressing **F6** until you reach line ~326 at which point kernel will finish executing
226 | 
227 | * Select the **Memory Buffers** tab 
228 | Notice that three buffers are allocated, their IDs, DDR memory address, and sizes
229 | 
230 |     ![](./images/debug_lab/FigDebugLab-15.png)
231 | 
232 | * Select the **Command Queue** tab and notice that there are no commands enqueued. 
233 | 
234 | ![](./images/debug_lab/FigDebugLab-16.png)
235 | 
236 | Lines ~326-330 create commands to read the data and results
237 | 
238 | ```
239 |    err |= clEnqueueReadBuffer( ... );
240 | ```
241 | 
242 | * Press **F6** to execute the first `clEnqueueReadBuffer()` to create a read buffer command for reading operand _d\_A_ 
243 | Notice the Command Queue tab shows one command submitted
244 | 
245 |     ![](./images/debug_lab/FigDebugLab-17.png)
246 | 
247 | * Press **F6** to execute the next `clEnqueueReadBuffer()` for _d\_B_ 
248 | Notice the Command Queue tab shows two commands submitted
249 | 
250 |     ![](./images/debug_lab/FigDebugLab-18.png)
251 | 
252 | * Set a breakpoint at line ~384 (`clReleaseKernel()`) and press **F8** to resume the execution  
253 | Notice that the Command Queue tab still shows entries
254 | * Press **F6** to execute `clReleaseKernel()`
255 | Notice the Memory Buffers tab is empty as all memories are released
256 | * Click **F8** to complete the execution
257 | * Close the SDx program
258 | 
259 | ## Conclusion
260 | 
261 | In this lab, you used the ChipScope Debug bridge and cores to perform hardware debugging. You also performed software debugging using the SDx GUI.
262 | 
263 | ## Appendix-I
264 | 
265 | ### Steps to Add ChipScope Debug core and build the design
266 | 
267 | * In the **Assistant** tab, expand **System > binary_container_1 > KVAdd**
268 | * Select **KVAdd**, right-click and select **Settings...**
269 | * In the **Hardware Function Settings** window, click **Refresh**, and then click on the _ChipScope Debug_ option for the _KVAdd_ kernel
270 | 
271 | ![](./images/debug_lab/enable_chipscope.png)
272 | 
273 | * Click **Apply and close**
274 | * In the **Project Explorer** tab, expand **src > sdx_debug > KVAdd** and double-click on the **host_example.cpp** to open it in the editor window
275 | * Around line 240 (after the _clCreateKernel_ section) enter the following lines of code and save the file. This will pause the host software execution after creating kernel but before allocating buffer
276 |    ```
277 |       printf("\nPress ENTER to continue after setting up ILA trigger...");
278 |       getc(stdin);
279 |    ```
280 | 
281 | ![](./images/debug_lab/FigDebugLab-2.png)
282 | 
283 | * Build the design
284 | 
285 | ### Disable automatic rebuilding of the design
286 | 
287 | When you export a project, and re-import it, the file modified dates may change and cause SDx to make the output executable and hardware kernel "out-of-date". This may cause the design to be automatically recompiled when an attempt is made to run the application from the GUI.  
288 | 
289 | * To disable automatic rebuilding, right click on the project folder, and select **C/C++ Build Settings**
290 | 
291 | * Select **C/C++ Build** and click on the **Behavior** tab
292 | 
293 | * Uncheck the following:
294 |    * Build on resource save (Auto Build)
295 |    * Build (Incremental build)
296 |    * Clean
297 | 
298 | When you export a project, and re-import it, these settings stop the bitstream being automatically rebuilt.
299 | 
300 | ![](./images/debug_lab/turn_off_autobuild.png)
301 | 
302 | If you need to rebuild the project, you can re-enable these settings. If you only need to update the host application, you can run the following command in a terminal in the project folder to rebuild the .exe only (where *debug.exe* is the name of the executable): 
303 | 
304 | ```
305 | cd ./workspace/debug/System
306 | make debug.exe
307 | ```
308 | 
309 | ### References
310 | 
311 | [SDx Debug techniques](https://www.xilinx.com/html_docs/xilinx2018_3/sdaccel_doc/dtp1532068222773.html)
312 | 
313 | 
314 | 
315 | 
316 | 
317 | 
318 | 
319 | 
320 | 


--------------------------------------------------------------------------------
/images/Fig-binary_container.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/Fig-binary_container.png


--------------------------------------------------------------------------------
/images/Fig-build.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/Fig-build.png


--------------------------------------------------------------------------------
/images/Fig-hw_button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/Fig-hw_button.png


--------------------------------------------------------------------------------
/images/Fig-refresh.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/Fig-refresh.png


--------------------------------------------------------------------------------
/images/Fig-run.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/Fig-run.png


--------------------------------------------------------------------------------
/images/SDX_IDE.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/SDX_IDE.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-1.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-10.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-11.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-12.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-13-1.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-13-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-13-2.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-14.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-15.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-16.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-17.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-2.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-3.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-4-1.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-4-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-4-2.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-5.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-6.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-7.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-8.png


--------------------------------------------------------------------------------
/images/connecting_lab/FigConnectingLab-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/FigConnectingLab-9.png


--------------------------------------------------------------------------------
/images/connecting_lab/nimbix/connect_to_instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/nimbix/connect_to_instance.png


--------------------------------------------------------------------------------
/images/connecting_lab/nimbix/linux_desktop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/nimbix/linux_desktop.png


--------------------------------------------------------------------------------
/images/connecting_lab/nimbix/select_desktop_mode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/nimbix/select_desktop_mode.png


--------------------------------------------------------------------------------
/images/connecting_lab/nimbix/select_instance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/nimbix/select_instance.png


--------------------------------------------------------------------------------
/images/connecting_lab/nimbix/select_instance_config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/connecting_lab/nimbix/select_instance_config.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-10.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-11.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-12.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-13.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-14.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-15.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-16.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-17.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-18.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-2.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-3.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-4.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-5.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-6.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-7.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-8.png


--------------------------------------------------------------------------------
/images/debug_lab/FigDebugLab-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/FigDebugLab-9.png


--------------------------------------------------------------------------------
/images/debug_lab/add_virtual_cable.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/add_virtual_cable.png


--------------------------------------------------------------------------------
/images/debug_lab/enable_chipscope.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/enable_chipscope.png


--------------------------------------------------------------------------------
/images/debug_lab/hw_manager_open_target.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/hw_manager_open_target.png


--------------------------------------------------------------------------------
/images/debug_lab/localhost_connected.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/localhost_connected.png


--------------------------------------------------------------------------------
/images/debug_lab/rtl_kernel_exe_properties.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/rtl_kernel_exe_properties.png


--------------------------------------------------------------------------------
/images/debug_lab/run_trigger_immediate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/run_trigger_immediate.png


--------------------------------------------------------------------------------
/images/debug_lab/set_virtual_cable_port.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/set_virtual_cable_port.png


--------------------------------------------------------------------------------
/images/debug_lab/trigger_button.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/trigger_button.png


--------------------------------------------------------------------------------
/images/debug_lab/turn_off_autobuild.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/debug_lab/turn_off_autobuild.png


--------------------------------------------------------------------------------
/images/f1_platform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/f1_platform.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-10.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-11.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-13.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-14.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-15-1.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-15-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-15-2.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-15-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-15-3.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-16.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-18.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-19.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-20.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-21-1.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-21.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-22.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-23.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-24.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-25.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-26.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-27.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-28.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-29.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-30.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-31.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-31.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-6.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-8.png


--------------------------------------------------------------------------------
/images/helloworld/FigGUIflowLab-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/FigGUIflowLab-9.png


--------------------------------------------------------------------------------
/images/helloworld/add_xclbin_argument.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/add_xclbin_argument.png


--------------------------------------------------------------------------------
/images/helloworld/empty_application_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/empty_application_project.png


--------------------------------------------------------------------------------
/images/helloworld/file_permissions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/file_permissions.png


--------------------------------------------------------------------------------
/images/helloworld/import_from_dir.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/import_from_dir.png


--------------------------------------------------------------------------------
/images/helloworld/import_srcs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/import_srcs.png


--------------------------------------------------------------------------------
/images/helloworld/sdx_hello_world_ide.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/sdx_hello_world_ide.png


--------------------------------------------------------------------------------
/images/helloworld/select_srcs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/select_srcs.png


--------------------------------------------------------------------------------
/images/helloworld/select_u200_platform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/select_u200_platform.png


--------------------------------------------------------------------------------
/images/helloworld/select_vector_add_fn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/select_vector_add_fn.png


--------------------------------------------------------------------------------
/images/helloworld/sys_estimate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/helloworld/sys_estimate.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-1.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-2.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-3.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-4.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-5.png


--------------------------------------------------------------------------------
/images/makefile_lab/FigMakefileLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/FigMakefileLab-6.png


--------------------------------------------------------------------------------
/images/makefile_lab/linker_flag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/makefile_lab/linker_flag.png


--------------------------------------------------------------------------------
/images/nice_dcv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/nice_dcv.png


--------------------------------------------------------------------------------
/images/nice_dcv_desktop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/nice_dcv_desktop.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-11.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-12.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-13.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-14.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-15-1.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-15.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-16.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-17.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-18.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-19.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-20-1.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-20.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-21.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-22-1.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-22.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-23.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-5.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-6.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-7.png


--------------------------------------------------------------------------------
/images/optimization_lab/FigOptimizationLab-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/FigOptimizationLab-8.png


--------------------------------------------------------------------------------
/images/optimization_lab/application_timeline_after_host_optimiaztion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/application_timeline_after_host_optimiaztion.png


--------------------------------------------------------------------------------
/images/optimization_lab/application_timeline_before_host_optimiaztion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/application_timeline_before_host_optimiaztion.png


--------------------------------------------------------------------------------
/images/optimization_lab/compute_unit_settings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/compute_unit_settings.png


--------------------------------------------------------------------------------
/images/optimization_lab/localhost_connected.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/localhost_connected.png


--------------------------------------------------------------------------------
/images/optimization_lab/zoon_buttons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/optimization_lab/zoon_buttons.png


--------------------------------------------------------------------------------
/images/putty_dcv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/putty_dcv.png


--------------------------------------------------------------------------------
/images/putty_ip4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/putty_ip4.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-10.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-11.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-12.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-13.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-14.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-15.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-16.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-17.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-18.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-19.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-20.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-21.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-22.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-4.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-5.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-6.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-7.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-8.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/FigRTLKernelLab-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/FigRTLKernelLab-9.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/hardware_emulation_application_timeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/hardware_emulation_application_timeline.png


--------------------------------------------------------------------------------
/images/rtlkernel_lab/hw_emulation_completed_successfully.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/images/rtlkernel_lab/hw_emulation_completed_successfully.png


--------------------------------------------------------------------------------
/rtl_kernel_wizard_lab.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # Using the RTL Kernel Wizard
 16 | 
 17 | ## Introduction
 18 | 
 19 | This lab guides you through the steps involved in using the SDx RTL Kernel wizard. This allows RTL code to be used in an SDx design.
 20 | 
 21 | ## Objectives
 22 | 
 23 | After completing this lab, you will be able to:
 24 | 
 25 | - Understand how to use the RTL Kernel wizard available in SDx 
 26 | - Create a new RTL based IP
 27 | - Add the new IP to an application
 28 | - Verify the functionality of the design in hardware
 29 | 
 30 | ## Steps
 31 | 
 32 | ### Create an SDx Project
 33 | 
 34 | * Start SDx and select the default workspace (or continue with the workspace from the previous lab)
 35 | 
 36 | * Click on the **Create Application Project** link on the _Welcome_ page
 37 | 
  38 | * On the _New Project_ page, enter **rtl\_kernel\_example** in the _Project name:_ field and click **Next**
 39 | 
 40 | * Select your target platform and click **Next**
 41 | 
 42 | * Select **Empty Application** and click **Finish**
 43 | 
 44 | ### Create RTL\_Kernel Project using RTL Kernel Wizard      
 45 | 
 46 | * Make sure the **project.sdx** under _rtl\_kernel\_example_ in the **Project Explorer** tab is selected
 47 | 
 48 | * Select **Xilinx &gt; RTL Kernel Wizard…**  
 49 | 
 50 |     ![](./images/rtlkernel_lab/FigRTLKernelLab-4.png)
 51 | 
 52 | * Click **Next**
 53 | 
 54 | * Change _Kernel_ name to **KVAdd**, (for Kernel Vector Addition), _Kernel vendor_ to **Xilinx** leaving the _Kernel library_ and _Number of clocks_ to the default values
 55 |     <p align="center">
 56 |     <img src ="./images/rtlkernel_lab/FigRTLKernelLab-5.png"/>
 57 |     </p>
 58 |     <p align = "center">
 59 |     <i>Setting general settings including name and number of clocks</i>
 60 |     </p>
 61 | 
 62 | * Click **Next**
 63 | 
  64 | * Leave _Number of scalar kernel input arguments_ set to the default value of **1** and the _Argument type_ as **uint** and click **Next**
 65 |     <p align="center">
 66 |     <img src ="./images/rtlkernel_lab/FigRTLKernelLab-6.png"/>
 67 |     </p>
 68 |     <p align = "center">
 69 |     <i>Selecting number of scalar arguments</i>
 70 |     </p>
 71 | 
 72 | * We will have three arguments to the kernel (2 input and 1 output) which will be passed through Global Memory. Set _Number of AXI master interfaces_ to be **3**
 73 | 
  74 | * Keep the data width of each AXI master interface at **64** (note this is specified in bytes, so this gives a width of 512 bits for each interface), and enter **A** as the argument name for *m00\_axi*, **B** for *m01\_axi*, and **Res** for *m02\_axi*
 75 |     <p align="center">
 76 |     <img src ="./images/rtlkernel_lab/FigRTLKernelLab-7.png"/>
 77 |     </p>
 78 |     <p align = "center">
 79 |     <i>Selecting number of AXI master interfaces, their widths, and naming them</i>
 80 |     </p>
 81 | 
 82 | * Click **Next** and the summary page will be displayed showing a function prototype and register map for the kernel.  
 83 | 
 84 |   Note the control register and the scalar operand are accessed via the S\_AXI\_CONTROL interface. The control register is at offset 0x0 and the scalar operand is at offset 0x10.
 85 | 
 86 | ![](./images/rtlkernel_lab/FigRTLKernelLab-8.png)
 87 | 
 88 | * Click **OK** to close the wizard  
 89 | 
 90 |   Notice that a Vivado Project will be created and opened
 91 | 
 92 | ![](./images/rtlkernel_lab/FigRTLKernelLab-9.png)
 93 | 
 94 | ### Analyze the design created by the RTL Kernel wizard
 95 | 
 96 | * Expand the hierarchy of the Design Sources in the Sources window and notice all the design sources, constraint file, and the basic testbench generated by the wizard
 97 | 
 98 | ![](./images/rtlkernel_lab/FigRTLKernelLab-10.png)
 99 | 
100 | There is one module to handle the control signals (ap_start, ap_done, and ap_idle) and three master AXI channels to read source operands from, and write the result to DDR. The expanded m02_axi module shows *adder*, *read*, *write* instances.
101 | * Select **Flow Navigator &gt; RTL ANALYSIS &gt; Open Elaborated Design** which will analyze the design and open a schematic view. Click **OK**
102 | * You should see two top-level blocks: example and control as seen below
103 | 
104 | ![](./images/rtlkernel_lab/FigRTLKernelLab-11.png) 
105 | 
106 | Notice the AXI Master interfaces are 64 bytes (or 512 bits) wide as specified earlier.
107 | 
108 | * Double-click on the example block and observe the three hierarchical Master AXI blocks
109 | 
110 | ![](./images/rtlkernel_lab/FigRTLKernelLab-12.png)
111 | 
 112 | * Zoom in on the top section and see the control logic the wizard has generated for the ap_start, ap_idle, and ap_done control signals
113 | 
114 | ![](./images/rtlkernel_lab/FigRTLKernelLab-13.png)
115 | 
116 | * Traverse through one of the AXI interface blocks (e.g. m02) and observe that the design consists of a Read Master, Write Master, and an Adder. (Click on the image to download an enlarged version if necessary)
117 | 
118 | ![](./images/rtlkernel_lab/FigRTLKernelLab-14.png)
119 | 
120 | * Close the elaborated view by selecting **File &gt; Close Elaborated Design**
121 | * Click **OK**
122 | 
123 | ### Generate the RTL Kernel
124 | 
125 | * Select **Flow > Generate RTL Kernel** 
126 | * Click **OK** using the default option (Sources-only kernel)  
127 | The packager will be run, generating the xo file which will be used in the design. 
128 | * Click **OK**, and **Yes** to exit Vivado and return to SDx.
129 | 
130 | ### Analyze the RTL kernel added to the SDx project 
131 | 
132 | * Expand the _src_ folder under the **rtl\_kernel\_example**  
133 | Notice that the _sdx\_rtl\_kernel\_wizard_ folder  has been added to the project. Expanding this folder shows the kernel (.xo) and a C++ file have been included.
134 | 
135 | ![](./images/rtlkernel_lab/FigRTLKernelLab-15.png)
136 | 
137 | * Double-click on the **host_example.cpp** to open it.   
138 |   * The _main_ function is defined around line 60. The number of words it transfers is 4096. 
139 |   * Notice around line 94 the source operands and expected results are initialized. 
 140 |   * The code around line 200 (starting at the `clCreateProgramWithBinary()` call) loads the xclbin and creates the OpenCL kernel (`clCreateKernel()`).
141 |   * The following lines show how the buffers are created in the device memory and enqueued (`clCreateBuffer()`, `clEnqueueWriteBuffer()`). 
 142 |   * Around line 290, the arguments to the kernel are set (`clSetKernelArg()`), and the kernel is enqueued to be executed (`clEnqueueTask()`).
143 |   * Around line 320 results are read back (`clEnqueueReadBuffer()`) and compared to the expected results. 
144 |   * The _Shutdown and cleanup section_ shows releasing of the memory, program, and kernel.
145 | 
146 | ### Add the design as a hardware kernel, and build the project
147 | 
148 | * Select **project.sdx** in the _Project Explorer_ tab to see the project settings page
149 | 
150 | * Click on the **Add Hardware Function button** (![](./images/Fig-hw_button.png)) and select _KVAdd_
151 | 
 152 | * Select **Emulation-HW** on the drop-down button of _Active build configuration_
153 | 
154 | ![](./images/rtlkernel_lab/FigRTLKernelLab-17.png)
155 | 
156 | * Select **Project &gt; Build Project** or click on the (![](./images/Fig-build.png)) button  
157 | 
 158 |   This will build the project, including the rtl\_kernel\_example.exe file, under the Emulation-HW directory
159 | 
160 | * Select **Run &gt; Run Configurations…** to open the configurations window
161 | 
 162 | * Click on the **Arguments** tab and select **Automatically add binary container(s) to arguments**
163 | 
164 | * Click **Apply**, and then click **Run** to run the application
165 | 
166 | ![](./images/rtlkernel_lab/FigRTLKernelLab-19.png)
167 | 
168 | * The Console tab shows that the test was completed successfully along with the data transfer rate
169 | 
170 | ![](./images/rtlkernel_lab/hw_emulation_completed_successfully.png)
171 | 
172 | * In the **Assistant** tab, expand **Emulation-HW > rtl_kernel_example-Default**, and double-click on the **Application Timeline** entry, expand all entries in the timeline graph, zoom appropriately and observe the transactions
173 | 
174 | 
175 | 
176 | 
177 | 
178 | ![](./images/rtlkernel_lab/hardware_emulation_application_timeline.png)
179 | 
180 | This example will be used as the starting point for the next lab, so it is not necessary to test the design in hardware now. 
181 | 
182 | ## Conclusion
183 | 
184 | In this lab, you used the RTL Kernel wizard to create an example RTL adder application. You configured the template and saw the example code that was generated. You performed HW emulation and analyzed the application timeline. 
185 | 
186 | ---------------------------------------
187 | 
 188 | Start the next lab: [5. Debugging](debug_lab.md)
189 | 


--------------------------------------------------------------------------------
/sdx_introduction.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # SDx
 16 | 
 17 | ## Introduction
 18 | 
 19 | This lab guides you through the steps involved in creating an SDx project. After creating the project you will run software and hardware emulation to verify the functionality of the design. You can also test the design in hardware.
 20 | 
 21 | ### Description of example application
 22 | 
 23 | The source code for the example design will be provided. The design consists of a C++ host application and an OpenCL kernel. The OpenCL kernel is a simple vector addition. The elements of 2 vectors (A & B) will be added together, and the result returned in a third array (C). The host application will initialize the two input arrays, send data to the kernel, and read back the result. The first elements of the arrays will be initialised with 0, or a value passed to the function. The initial value for each subsequent element will be incremented. E.g. If the initialization value is 0, A & B will have the initial values [0,1,2,3 ...]. This will result in the following values returned to C [0,2,4,6 ...]
 24 | 
 25 | You will compile and check a software only version of the application. The *vector add* OpenCL kernel will then be implemented as a hardware kernel. You will first build an emulation version of the design and run a simulation of the hardware kernel. You will then test the application with the kernel running in the FPGA.
 26 | 
 27 | ## Objectives
 28 | 
 29 | After completing this lab, you will be able to:
 30 | 
  31 | - Create a project using the SDx GUI
 32 | - Run Software Emulation to verify the functionality of a design
 33 | - Run Hardware Emulation to verify the functionality of the generated hardware
 34 | - Build the system and test it in hardware 
 35 | - Perform profile and application timeline analysis on the design running in hardware
 36 | 
 37 | 
 38 | ## Steps
 39 | ### Create an SDx Project
 40 | 
 41 | * Launch SDx by executing **sdx** in a terminal window, or click on the **SDX** desktop icon if available.
 42 | You will be prompted to select a workspace directory
 43 | 
 44 | * Click Launch to accept the default (usually ~/workspace)
 45 | The Xilinx SDx IDE window will be displayed.
 46 | 
 47 | ![](./images/SDX_IDE.png)
 48 | 
 49 | * Click on **Create Application Project**
 50 | The _Project Type_ page will be displayed 
 51 | 
 52 | * Enter **hello_world** as the _Project name_ and click **Next**  
 53 | 
 54 | * Select the target platform and click **Next**
 55 | 
 56 | ![](./images/helloworld/select_u200_platform.png)
 57 | 
 58 | * Select **Empty Application** and click **Finish**
 59 | 
 60 | ![](./images/helloworld/empty_application_project.png)
 61 | 
 62 | The SDx *development* view should open:
 63 | 
 64 | ![](./images/helloworld/sdx_hello_world_ide.png)
 65 | 
 66 | There are different *view* settings in SDx, and all perspectives are customizable. Panels can be added or removed, and the layout can be changed. For example, the *Debug* perspective has a different default layout and views. 
 67 | 
  68 | There are six main windows in this perspective: Project Explorer, Main panel (SDx Application Project Settings in the image above), Assistant, Outline, multi-tab console, and Emulation Console.
 69 | 
 70 | * In the Project explorer, expand the *hello_world* folder if necessary, and *right click* on the **src** folder, and select **import**
 71 | 
 72 | * In the *Import* window, select **General>File System** and click **Next**
 73 | 
 74 |     ![](./images/helloworld/import_srcs.png)
 75 | 
 76 | * Browse to the ~/compute_acceleration/sources/helloworld_ocl/src directory, and click **OK** to select this directory
 77 | 
 78 |     ![](./images/helloworld/import_from_dir.png)
 79 | 
 80 | * Select all five source files **krnl_vadd.cl**, **vadd.cpp**, **vadd.h**, **xcl.cpp**, **xcl.h** and click **Finish**
 81 | 
 82 |     ![](./images/helloworld/select_srcs.png)
 83 | 
 84 | * **krnl_vadd.cl** is the OpenCL source for a simple vector addition kernel
 85 | * **vadd.cpp/.h** are the host application source and header
 86 | * **xcl.cpp/.h** are the source and header for some helpful OpenCL function wrappers
 87 | 
 88 | ### Perform Software Emulation
 89 | 
 90 | * Make sure the *SDx Application Project Settings* are visible in the main panel. If they are not, open **project.sdx** under _hello\_world_ in the **Project Explorer** tab.
 91 | 
 92 | * In the project settings, in the **Hardware Functions** section, click on the _Add Hardware Function_ button icon (![alt tag](./images/Fig-hw_button.png)). 
 93 | 
 94 |     SDx will automatically parse the source files, and will list functions defined in the design here. 
 95 | 
 96 |     This design has only one function `krnl_vadd()` that is a candidate for hardware acceleration.
 97 | 
 98 |     ![](./images/helloworld/select_vector_add_fn.png)
 99 | 
100 | * Select the `krnl_vadd()` function and click **OK** 
101 | 
102 | This will automatically add a *binary container* and include the function inside it. 
103 | 
104 | * Click on the drop-down button of _Active build configuration_ and make sure **Emulation-SW** is selected. Alternatively, this can also be set from the **Project** menu **Build Configurations &gt; Set Active &gt; Emulation-SW** 
105 | 
106 |     ![](./images/helloworld/FigGUIflowLab-6.png)
107 | 
108 | * Click on the build (![alt tag](./images/Fig-build.png)) button or alternatively, select **Project &gt; Build Project**
109 | 
110 | This will build the project and generate hello_world.exe under the *Emulation-SW* directory
111 | 
112 | * From the SDx menu, select **Run &gt; Run Configurations…** to open the configurations window
113 | 
114 | From here, arguments can be passed to the host application. The binary container needs to be passed to the host. 
115 | 
116 | As mentioned in the introduction, the two arrays that will be added together can be initialized by passing a value to the host application. This initialization value can also be set here. 
117 | 
118 | * Click on the **Arguments** tab and select **Automatically add binary container(s)  arguments** 
119 | 
120 | **../binary_container_1.xclbin** should be automatically added to the text area. Add the initialization value for the arrays by typing a number into the box. In this case, type **0** after *../binary_container_1.xclbin* (with a space in between the two arguments).
121 | 
122 | ![](./images/helloworld/FigGUIflowLab-9.png)
123 | 
124 | * Click **Run** to save the configuration and run the application
125 | 
126 | The application can also be run by clicking the Run button (![alt tag](./images/Fig-run.png)). 
127 | 
128 | The application will be run and the output will be displayed in the Console tab
129 | 
130 | You should see a **TEST PASSED** message, preceded by the values of the elements of the arrays, and the result of the addition. 
131 | 
132 | ```
133 |    TEST PASSED
134 | ```
135 | 
136 | You can go back and change the initialization value passed to the host in the *Run Configuration*, and rerun the application to see different numerical results. 
137 | 
138 | 
139 | ### Perform Hardware Emulation
140 | 
141 | The *Software Emulation* flow checks functional correctness of the software application, but it does not guarantee the correctness of the design on the FPGA target. The *Hardware (HW) Emulation* flow can be used to verify the functionality of the generated logic. This flow invokes the hardware simulator in the SDx environment. As a consequence, the HW Emulation flow will take a little longer to build and run than the SW Emulation flow.
142 | 
143 | The Hardware Emulation flow is not cycle accurate, but provides more detailed profiling information than software emulation. It can be used to do some analysis and optimization of the performance of the application.
144 | 
145 | * Click on the drop-down button of _Active build configuration_ and select **Emulation-HW** 
146 | 
147 | ![](./images/helloworld/FigGUIflowLab-8.png)
148 | 
149 | * Click on the (![alt tag](./images/Fig-build.png)) button. 
150 | 
151 | This will build the project including hello\_world.exe file under the Emulation-HW directory
152 | 
153 | * Select **Run &gt; Run Configurations…** to open the configurations window
154 | * Click on the **Arguments** tab and check if _binary\_container\_1.xclbin_ is already assigned. Optionally set an initialization value as before.
155 | 
156 | * Click **Run** to run the application
157 | * The Console tab shows that the test was completed successfully along with the data transfer rate
158 | 
159 | ```console
160 |    TEST PASSED
161 |    INFO: [SDx-EM 22] [Wall clock time: 11:36, Emulation time: 0.0418116 ms]
162 |    Data transfer between kernel(s) and global memory(s)
163 |    krnl_vadd_1:m_axi_gmem-DDR          RD = 32.000 KB              WR = 16.000 KB   
164 | ```
165 | 
166 | ### Review the HLS Report
167 | 
168 | * In the **Assistant** tab, under **Emulation-HW &gt; binary\_container\_1 &gt;  krnl\_vadd** double-click on the **HLS Report**
169 | 
170 |     ![](./images/helloworld/FigGUIflowLab-11.png)
171 | 
172 | The window will open showing the Synthesis report for the **krnl_vadd** accelerator.
173 | 
174 | * Scroll down the window and observe the timing, latency, and loop performance results. 
175 | 
176 | Observe the target clock period. This will be 3.33 (ns) or 4.00 (ns) depending on the target (AWS/Alveo) you are using. You will see how to set the clock later. Check the estimated actual clock period, which should be less than the target, indicating that the timing has been met. 
177 | 
178 | ![](./images/helloworld/FigGUIflowLab-13.png)
179 | 
180 | * Scroll down further and observe the resource utilization by the accelerator (again the numbers may be different to your results)
181 | 
182 | ![](./images/helloworld/FigGUIflowLab-14.png)
183 | 
184 | ###  Review the profile summary report
185 | 
186 | * In the **Assistant** tab under **Emulation-HW &gt; hello\_world-Default** double-click on the **Profile Summary** entry 
187 | 
188 | The numbers in this report will vary depending on the value of the DATA_SIZE constant in vadd.cpp. When you have completed this section, you can go back and change DATA_SIZE, recompile and rerun the application, and check the updated profiling results. 
189 | 
190 | Notice the report window has four tabs: **Top Operations, Kernels and Compute Units,  Data Transfers, OpenCL APIs**. 
191 | 
192 | The *Top Operations* tab summarizes the profiling information for the design. There is only one kernel in this design. Arrays A and B are transferred to memory, then to the kernel, and the result C is written back from the kernel to memory. 
193 | 
194 | The PCIe interface between the host and FPGA is 512 bits, or 64 bytes. This will determine the average bytes per transfer, and the transfer efficiency. 
195 | 
196 | ![](./images/helloworld/FigGUIflowLab-16.png)
197 | 
198 | * Click on the **Kernels &amp; Compute Units** tab and observe the number of Enqueues (1), and the kernel execution time.
199 | 
200 | * Click on the **Data Transfers** tab.
201 | 
202 |     ​    ![](./images/helloworld/FigGUIflowLab-18.png)
203 | 
204 | * Look at the transfers between *HOST and Global Memory*
205 | 
206 | In the host application, A and B are written from *HOST* memory to *GLOBAL* memory and C is read back to *HOST* memory from *GLOBAL* memory. This is why the *WRITE* data is ~2x the amount of the *READ* data. There is only one transfer as the arrays are transferred as one block of data. 
207 | 
208 | * Look at the transfers between *Kernels and Global Memory*. In this example there is only one kernel.
209 | 
210 | The kernel *READS* A and B, and *WRITES* C. This is why there are 2x the number of read transfers compared to write transfers. There are multiple transfers as each element of the array is read into the design sequentially. 
211 | 
212 | Observe the average size of the data transferred, and the estimated transfer rates. This profile summary will be useful when trying to optimize your own designs. 
213 | 
214 | ### Review the System Estimate report
215 | 
216 | * Double-click on the **System Estimate** entry under the **Emulation-HW > binary_container_1 > krnl_vadd** in the **Assistant** tab
217 |   The report shows the estimated frequency and the resource utilization for the given kernel (krnl\_vadd)
218 | 
219 | ![](./images/helloworld/sys_estimate.png)
220 | 
221 | * Close SDx
222 | 
223 | ### System Build
224 | 
225 | At this stage, you could build the project, but as it takes some time to compile you can skip this step for now. See the Appendix below for instructions on how to build the hardware. 
226 | 
227 | ## Run the precompiled solution (Optional)
228 | 
229 | As building the FPGA hardware takes some time, a precompiled solution is provided. The results should not be any different to the HW and SW emulation, so this step is optional. 
230 | 
231 | For AWS, execute the following in a new terminal, as this needs to be run as sudo
232 | 
233 | ```
234 | sudo sh
235 | source /opt/Xilinx/SDx/2018.3.op2405991/settings64.sh
236 | source /opt/xilinx/xrt/setup.sh
237 | export PLATFORM_REPO_PATHS=/home/centos/src/project_data/aws-fpga/SDAccel/aws_platform/xilinx_aws-vu9p-f1-04261818_dynamic_5_0
238 | ```
239 | 
240 | * Start SDx (execute ```sdx``` from the terminal)
241 | * From the SDx file menu, select **import**
242 | * Expand *Xilinx* and select **SDx Project** and click **Next**
243 | * Choose *SDx project exported zip file* and click **Next**
244 | * Browse to **~/compute_acceleration/solutions/hello_world/[aws|u200]/hello_world_sol.zip** and click **OK**
245 | 
246 | You should see a new *hello_world_sol* folder in the Project Explorer
247 | 
248 | #### Set the executable file permissions
249 | 
250 | Zip files do not preserve Unix file permissions, so the executable permissions must be modified manually.
251 | 
252 | * Expand *hello_world_sol > System* and right click on **hello_world_sol.exe**
253 | 
254 | * Select **Execute** for the *Owner* permissions and click **Apply and Close**
255 | 
256 | ![](./images/helloworld/file_permissions.png)
257 | 
258 | #### Disable Auto building
259 | 
260 | * Right click on the project folder, select C/C++ Build Settings
261 | * In C/C++ Build, in the **Behavior** tab make sure the following are unchecked: *Build on resource save (Auto Build)*, *Build (incremental build)* and *Clean* 
262 | * Click **Apply and Close**
263 | 
264 | #### Run the application
265 | 
266 | * Open the project.sdx and select **System** as the *Active build configuration*
267 | * In the SDx *Run* menu, select **Run Configurations**
268 | * Expand OpenCL if necessary, and select the **hello_world-Default** configuration
269 | * Check that the *binary container* has been included in the *Arguments* tab and click **Run**
270 |   * Note for AWS, don't click the box to automatically include the binary container. This will change the argument to ../binary_container_1.xclbin instead of ../binary_container_1.**aws**xclbin binary required for AWS. 
271 | 
272 | You should see the application output in the console. The output should be similar to what you saw for the SW and HW Emulation runs. 
273 | 
274 | ## Conclusion 
275 | 
276 | In this lab, you used SDx to create a project. You ran the design using the software and hardware emulation flows, verified the output, and reviewed the reports. 
277 | 
278 | ---------------------------------------
279 | 
280 | Continue to the [optimization Lab](./Optimization_lab.md)
281 | 
282 | ---------------------------------------
283 | 
284 | ## Appendix: Build Full Hardware
285 | 
286 | Note that building the project can take around two hours. Skip this step in a tutorial environment.
287 | 
288 | * Click on the drop-down button of _Active build configuration_ and select **System** or select **Project &gt; Build Configurations &gt; Set Active &gt; System**
289 | 
290 | * Click on the (![alt tag](./images/Fig-build.png)) button or select **Project &gt; Build Project**
291 | This will build the project under the **System** directory. The built project will include **hello\_world.exe** file along with **binary\_container\_1.xclbin** file. 
292 | 
293 | ### Test on Alveo
294 | 
295 | * Once the project is built, you can click on the *run* button (![alt tag](./images/Fig-run.png)) and verify you see the same results as before. For AWS, see the next step.
296 | 
297 | ### Test on AWS (create AFI)
298 | 
299 | Before the design can be run on AWS an AFI (Amazon FPGA Image) is required.
300 | 
301 | Once the full system is built, you can create an AFI by following the steps listed <a href="Creating_AFI.md">here</a>
302 | 
303 | 


--------------------------------------------------------------------------------
/setup_aws.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # Connecting to AWS
 16 | 
 17 | To get started with AWS, you will need an Amazon account. You will also need AWS credit to run the tutorial. If you are a professor or a student, you may be eligible for free credit by registering with [AWS educate](https://aws.amazon.com/education/awseducate/). 
 18 | 
 19 | 
 20 | 
 21 | ## Set up an AWS instance
 22 | 
 23 | Use the following guide to set up an AWS instance. 
 24 | 
 25 | <https://docs.aws.amazon.com/efs/latest/ug/gs-step-one-create-ec2-resources.html>
 26 | 
 27 | Make sure to use the FPGA Developer AMI. Version 1.6.0 includes the Xilinx SDx 2018.3 tools that this tutorial is based on: <https://aws.amazon.com/marketplace/pp/B06VVYBLZZ>
 28 | 
 29 | ### Login into the AWS and starting an F1 instance
 30 | 
 31 | 1. Once you have an account, log in to the EC2 AWS Console:
 32 | 
 33 |     https://console.aws.amazon.com/ec2
 34 | 
 35 |     This should bring you to the EC2 dashboard (Elastic Compute).
 36 | 
 37 |     In the EC2 dashboard, select Launch Instance. From here you should be able to start your instance. 
 38 | 
 39 | ## Additional setup
 40 | 
 41 | You may want to do some additional setup to allow you to VNC to your instance. You can also follow the instructions in [Setup XUP AWS Workshop](setup_xup_aws_workshop.md) to connect to your instance. 
 42 | 
 43 | ### VNC server setup
 44 | 
 45 | When setting up an instance for the first time, you need to install vncserver software. 
 46 | 
 47 | #### Install VNC server
 48 | In a terminal, execute the following commands
 49 | 
 50 | ```
 51 |    sudo yum install -y tigervnc-server
 52 |    sudo yum groupinstall -y "Server with GUI"
 53 | ```
 54 | 
 55 | When installing vncserver, you will be prompted to set up a password that you will need later. 
 56 | 
 57 | ### Start vncserver
 58 | 
 59 | Each time you start an instance, you will need to start vncserver 
 60 | 
 61 | 
 62 | ```
 63 |    vncserver -geometry 1920x1080
 64 | ```
 65 | 
 66 | You can choose your preferred geometry (screensize)
 67 | 
 68 | You should see a status message in the terminal once *vncserver* has started. 
 69 | 
 70 | Take note of the number after the “:”
 71 | 
 72 | In this case, 1. This is the port the VNC viewer will connect to on the VNC server and needs to be specified as a two digit number below: 01.
 73 | 
 74 | Connect to AWS instance from VNC viewer
 75 | From VNC viewer, specify the IP address of your AWS instance, followed by the VNC port number (as identified above), in this case :1
 76 | 
 77 | When prompted, enter the VNC server password set up earlier.
 78 | 
 79 | You should then be connected to the AWS instance.
 80 | 
 81 | ### Verify XRT and SDx tools
 82 | 
 83 | Open a terminal and verify the Xilinx SDx tools have been preinstalled and are on the path:
 84 | 
 85 | ```
 86 |    which sdx
 87 | ```
 88 | 
 89 | Note that the XRT tools are installed (/opt/xilinx/xrt) but are not included on the path by default. 
 90 | 
 91 | ```
 92 |    sudo chmod 777 /opt/xilinx/xrt/setup.sh
 93 | ```
 94 | 
 95 | Execute the following to add `source /opt/xilinx/xrt/setup.sh` to ~/.bashrc (or manually edit ~/.bashrc and add the line).
 96 | 
 97 | ```
 98 |    echo "source /opt/xilinx/xrt/setup.sh" >> ~/.bashrc
 99 | ```
100 | 
101 | AWS_FPGA_REPO_DIR is defined in /etc/profile.d/aws-f1.sh 
102 | 
103 | 
104 | ```
105 |    cd ~/src/project_data
106 |    git clone https://github.com/aws/aws-fpga
107 |    cd $AWS_FPGA_REPO_DIR                                         
108 |    source sdaccel_setup.sh
109 |    echo "export PLATFORM_REPO_PATHS=/home/centos/src/project_data/aws-fpga/SDAccel/aws_platform/xilinx_aws-vu9p-f1-04261818_dynamic_5_0" >> ~/.bashrc
110 |     
111 | ```
112 | 
113 | For more details see:
114 | 
115 | https://github.com/aws/aws-fpga/blob/master/SDAccel/README.md
116 | 
117 | 
118 | 


--------------------------------------------------------------------------------
/setup_local_computer.md:
--------------------------------------------------------------------------------
 1 | <table style="width:100%">
 2 |   <tr>
 3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
 4 |   </tr>
 5 |   <tr>
 6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
 7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
 8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
 9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
12 |   </tr>
13 | </table>
14 | 
15 | # Setup SDx on your own computer
16 | 
17 | To run (or build) these labs on your own computer, install SDAccel and the SDAccel license. For non-commercial/academic use, SDAccel licenses are available from the [Xilinx University Program](https://www.xilinx.com/university).
18 | 
19 | [Download SDAccel 2018.3](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/sdaccel-development-environment/2018-3.html) and install the tools. 
20 | 
21 | [Download XRT and the U200 package](https://www.xilinx.com/products/boards-and-kits/alveo/u200.html#gettingStarted) for your computer, and install both packages.
22 | 
23 | ### Setup the tools
24 | 
25 | Add the following to your environment setup. 
26 | 
27 | ```csh
28 | source /opt/xilinx/xrt/setup.(c)sh
29 | source $XILINX_SDX/settings64.(c)sh
30 | setenv PLATFORM_REPO_PATHS $ALVEO_PLATFORM_INSTALLATION_DIRECTORY
31 | ```
32 | 
33 | ```bash
34 | export PLATFORM_REPO_PATHS=$ALVEO_PLATFORM_INSTALLATION_DIRECTORY
35 | ```


--------------------------------------------------------------------------------
/setup_nimbix.md:
--------------------------------------------------------------------------------
 1 | <table style="width:100%">
 2 |   <tr>
 3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
 4 |   </tr>
 5 |   <tr>
 6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
 7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
 8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
 9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
12 |   </tr>
13 | </table>
14 | 
15 | # Connecting to Nimbix
16 | 
17 | * Log in to Nimbix: https://platform.jarvice.com/
18 | 
19 | * Click *Compute* in the top left menu to select a compute instance
20 | 
21 | * Type Xilinx to filter the list of instances
22 | 
23 | ![](./images/connecting_lab/nimbix/select_instance.png)
24 | 
25 | * Select the *Xilinx SDAccel Development* instance
26 | 
27 | * Click on Desktop mode
28 | 
29 | ![](./images/connecting_lab/nimbix/select_desktop_mode.png)
30 | 
31 | * Select the instance you prefer. 
32 | 
33 | The smallest instance can be used for the labs. For the first part of the labs, you don't need to select an instance with Alveo hardware. 
34 | 
35 | ![](./images/connecting_lab/nimbix/select_instance_config.png)
36 | 
37 | When the instance is ready, you will see the option to *Click here to connect*.
38 | 
39 | * Click on the link to connect
40 | 
41 | ![](./images/connecting_lab/nimbix/connect_to_instance.png)
42 | 
43 | A Linux desktop will open in a new tab in your browser.
44 | 
45 | ![](./images/connecting_lab/nimbix/linux_desktop.png)


--------------------------------------------------------------------------------
/setup_sdx.md:
--------------------------------------------------------------------------------
 1 | <table style="width:100%">
 2 |   <tr>
 3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
 4 |   </tr>
 5 |   <tr>
 6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
 7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
 8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
 9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
12 |   </tr>
13 | </table>
14 | 
15 | # Setup SDx
16 | 
17 | There are two main parts to this tutorial - using the [Xilinx SDx software](https://www.xilinx.com/products/design-tools/software-zone/sdaccel.html) and building (compiling) designs, and using and testing those designs in hardware. 
18 | 
19 | You can run this tutorial in different ways. 
20 | 
21 | * If you have an Alveo board, you can run all parts of the tutorial on a local machine. 
22 | 
23 | * You can use the SDx software in the cloud, with hardware in the cloud (AWS or Nimbix).
24 | 
25 | * You can use the SDx software on a local machine for building designs, and only switch to the cloud to test in hardware.
26 | 
27 | This tutorial shows how to use SDx with either AWS EC2 F1 or Alveo U200 (locally, or in the Nimbix cloud). Sources and precompiled solutions are provided for AWS EC2 F1 (f1.2xlarge) and Alveo U200. You may be able to use the SDx tutorial instructions with other cloud providers, and other hardware. 
28 | 
29 | Once you have decided how you want to run the tutorial, follow the appropriate instructions below. 
30 | 
31 | ## Local computer 
32 | 
33 | To use your own computer, [install and set up SDx and install the Alveo U200 packages](./setup_local_computer.md)
34 | 
35 | ## Use Nimbix (Alveo)
36 | 
37 | The Xilinx SDx tools and Alveo U200 hardware are available in the Nimbix cloud. Use the following instructions to [connect to a Nimbix Alveo instance](./setup_nimbix.md). A [free 100 hr Alveo trial](https://www.nimbix.net/alveo/) is currently available from Nimbix. This is the easiest way to work through this tutorial with Alveo U200 hardware. However, please note the debug lab is not currently supported on Nimbix as the Xilinx Virtual Cable is not available. 
38 | 
39 | ## AWS EC2 F1
40 | 
41 | An [FPGA Developer AMI](https://aws.amazon.com/marketplace/pp/B06VVYBLZZ) (Amazon Machine Image) is available with the Xilinx SDx software preinstalled. This can be used to target AWS EC2 F1 hardware. An AMI is like a Virtual Machine image. You can use this AMI and the following instructions to [set up and connect to an AWS instance](./setup_aws.md)
42 | 
43 | You can also install Xilinx SDx on your local machine, build designs offline, and use AWS F1 hardware for testing. See the Amazon guide to using [AWS EC2 FPGA Development Kit](https://github.com/aws/aws-fpga) for details on setting up your machine. 
44 | 
45 | ## XUP AWS Tutorial
46 | 
47 | If you are attending a live instructor-led XUP AWS tutorial, preconfigured AWS F1 instances will be provided for you. Use the following instructions to [connect to your assigned AWS XUP tutorial instance](./setup_xup_aws_workshop.md)
48 | 
49 | # Getting started with the tutorials
50 | 
51 | Once you have set up your computer/cloud instance, you can *git clone* this repository to get started running the tutorial. The repository includes these instructions, and also a copy of source files, and solutions you will need for the tutorial. 
52 | 
53 | The tutorial assumes you will clone this repository to your Linux home area. If you choose to clone it somewhere else, you will need to adjust the path where specified in the tutorial instructions.
54 | 
55 | ```console
56 |    cd ~
57 |    git clone https://github.com/xupgit/compute_acceleration
58 | ```
59 | 
60 | Proceed to the first lab [introducing SDx](sdx_introduction.md) 
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/setup_xup_aws_workshop.md:
--------------------------------------------------------------------------------
  1 | <table style="width:100%">
  2 |   <tr>
  3 |     <th width="100%" colspan=6><h2>XUP SDx Labs (2018.3)</h2></th>
  4 |   </tr>
  5 |   <tr>
  6 |     <td align="center"><a href="setup_sdx.md">1. Setup SDx</a></td>
  7 |     <td align="center"><a href="sdx_introduction.md">2. Introduction to SDx</a></td>
  8 |     <td align="center"><a href="Optimization_lab.md">3. Optimization</a></td>
  9 |     <td align="center"><a href="rtl_kernel_wizard_lab.md">4. RTL Kernel Wizard</a></td>
 10 |     <td align="center"><a href="debug_lab.md">5. Debugging</a></td>
 11 |     <td align="center"><a href="sources/helloworld_ocl/command_line.ipynb">6. SDx command line</a></td>
 12 |   </tr>
 13 | </table>
 14 | 
 15 | # Connecting to AWS
 16 | 
 17 | ## Introduction
 18 | 
 19 | The following instructions are for attendees on a live instructor-led workshop where an AWS F1 instance has been set up for you, and you have been provided with log-in details. If you are not attending a live workshop, go back to the [Setup SDx](./setup_sdx.md) page and follow one of the other options to work through these labs.
 20 | 
 21 | This lab will guide you through the steps involved in connecting to a Xilinx workshop AWS EC2 F1 instance, and starting and stopping the instance.
 22 | 
 23 | ## Objectives
 24 | 
 25 | After completing this lab, you will be able to:
 26 | 
 27 | - Connect to an AWS EC2 F1 instance using the provided credentials
 28 | - Start an instance
 29 | - Start a RDP (Remote Desktop Protocol) session
 30 | - Close the RDP session
 31 | - Stop the instance
 32 | 
 33 | ## Steps
 34 | Each registered participant in the Xilinx workshop has been allocated a pre-configured EC2 F1 instance and should have received an email with the following details:  
 35 | 
 36 | - Account ID, 
 37 | - IAM username, 
 38 | - Link to access a pre-configured EC2 F1 instance
 39 | 
 40 | ### Login into the AWS and starting an F1 instance
 41 | 
 42 | * Follow the link provided by your instructor, or go to [https://console.aws.amazon.com/ec2](https://console.aws.amazon.com/ec2) to open a login page  
 43 |   If you used the link, you should see a login page similar to the one shown here:
 44 | 
 45 | <p align="center">
 46 | <img src ="./images/connecting_lab/FigConnectingLab-1.png"/>
 47 | </p>
 48 | <p align = "center">
 49 | <i>Login page accessed through the provided link</i>
 50 | </p>  
 51 | 
 52 | * Use the log in details provided by your instructor. 
 53 | 
 54 | * In the top right corner, using the drop-down button, select a region with F1 instances, such as **N. Virginia (US East)** or the region indicated by your instructor
 55 | 
 56 | <p align="center">
 57 | <img src ="./images/connecting_lab/FigConnectingLab-3.png"/>
 58 | </p>
 59 | <p align = "center">
 60 | <i>Selecting a region</i>
 61 | </p>  
 62 | If you select the wrong region you may not see your instance.
 63 | 
 64 | * Click on the **EC2** link on the dashboard or if not visible, then click on the _Services_ drop-down button and then click on **EC2**
 65 | 
 66 | <p align="center">
 67 | <img src ="./images/connecting_lab/FigConnectingLab-4-1.png"/>
 68 | <img src ="./images/connecting_lab/FigConnectingLab-4-2.png"/>
 69 | </p>
 70 | <p align = "center">
 71 | <i>Accessing EC2 service</i>
 72 | </p>  
 73 | 
 74 | * Click on the **Instances** link on the left panel
 75 |     <p align="center">
 76 |     <img src ="./images/connecting_lab/FigConnectingLab-5.png"/>
 77 |     </p>
 78 |     <p align = "center">
 79 |     <i>Accessing Instances</i>
 80 |     </p>  
 81 |     You may see several instances
 82 | * Enter your username in the filter field just below the **Launch Instance** button and hit enter
 83 |     <p align="center">
 84 |     <img src ="./images/connecting_lab/FigConnectingLab-6.png"/>
 85 |     </p>
 86 |     <p align = "center">
 87 |     <i>Filtering your instance</i>
 88 |     </p>  
 89 | * Making sure that your instance is selected, click on the **Actions &gt; Instance State &gt; Start**
 90 |     <p align="center">
 91 |     <img src ="./images/connecting_lab/FigConnectingLab-7.png"/>
 92 |     </p>
 93 |     <p align = "center">
 94 |     <i>Starting an instance</i>
 95 |     </p>  
 96 | * Click on the **Yes, Start** button
 97 | * Click on the refresh button(![alt tag](./images/Fig-refresh.png)) to see the updated status to _Running_
 98 |     <p align="center">
 99 |     <img src ="./images/connecting_lab/FigConnectingLab-8.png"/>
100 |     </p>
101 |     <p align = "center">
102 |     <i>Running state</i>
103 |     </p>  
104 | * Make a note of the Public DNS and IPv4 Public IP which will be used by PuTTY and Remote Desktop (RDP)
105 |     <p align="center">
106 |     <img src ="./images/connecting_lab/FigConnectingLab-9.png"/>
107 |     </p>
108 |     <p align = "center">
109 |     <i>Assigned IP to the running instance</i>
110 |     </p>  
111 | 
112 | 
113 | 
114 | ### Connecting to AWS instance using NICE DCV
115 | 
116 | NICE DCV as recommended by Amazon will be used to remote desktop to the instance. 
117 | 
118 | * Download and install the appropriate NICE DCV client if necessary from here: https://download.nice-dcv.com
119 | 
120 | The NICE DCV session has already been started on the instance provided. See the Appendix for details on how to setup a session. 
121 | 
122 | ### Start NICE DCV
123 | 
124 | * Open the NICE DCV application, enter the *IPv4 Public IP* from the Amazon console and click **Open**
125 | 
126 | ![](./images/nice_dcv.png)
127 | 
128 | * When prompted, enter the username and password provided by your instructor to connect to the instance.
129 | 
130 | ![](./images/nice_dcv_desktop.png)
131 | 
132 | 
133 | 
134 | # Verify XRT and SDx tools
135 | 
136 | - Right-click on the desktop and select **Open Terminal** and verify the Xilinx SDx tools have been preinstalled and are on the path by executing the following command:
137 | 
138 | ```
139 |    which sdx
140 | ```
141 | 
142 | The XRT (Xilinx Run Time) tools are installed (/opt/xilinx/xrt) but are not included on the path by default. 
143 | 
144 | * Execute the following to change the permissions of the XRT setup file, and to automatically source the XRT tools. Make sure to only execute this once. 
145 | 
146 | 
147 | ```
148 |    sudo chmod 774 /opt/xilinx/xrt/setup.sh
149 |    echo "source /opt/xilinx/xrt/setup.sh" >> ~/.bashrc
150 | ```
151 | 
152 | For your reference, in the commands below, $AWS_FPGA_REPO_DIR has already been defined in the environment from: /etc/profile.d/aws-f1.sh 
153 | 
154 | * Execute the following to clone the *aws-fpga* repository and set up the Xilinx tools. aws-fpga includes the AWS F1 tools, HDK and documentation:
155 | 
156 | ```
157 |    cd ~/src/project_data
158 |    git clone https://github.com/aws/aws-fpga
159 |    cd $AWS_FPGA_REPO_DIR                                         
160 |    source sdaccel_setup.sh
161 |    echo "export PLATFORM_REPO_PATHS=/home/centos/src/project_data/aws-fpga/SDAccel/aws_platform/xilinx_aws-vu9p-f1-04261818_dynamic_5_0" >> ~/.bashrc
162 |     
163 | ```
164 | 
165 | For more details see:
166 | 
167 | https://github.com/aws/aws-fpga/blob/master/SDAccel/README.md
168 | 
169 | ---------------------------------------
170 | 
171 | Return to [Setup SDx](./setup_sdx.md) and go to the **Getting started with the tutorials** section to finish setting up by cloning the tutorial labs. 
172 | 
173 | ---------------------------------------
174 | 
175 | ## Appendix
176 | 
177 | ## Set up the NICE DCV session
178 | 
179 | Open PuTTY, enter the IPv4 Public IP address from the Amazon console, and click **Open**
180 | 
181 | ![](./images/putty_ip4.png)
182 | 
183 | This should open a terminal to the AWS instance. 
184 | 
185 | * In the terminal, enter the following command to start the DCV server:
186 | 
187 | ```
188 | dcv create-session --type virtual --user centos centos
189 | ```
190 | 
191 | ![](./images/putty_dcv.png)
192 | 
193 | * Stop the firewall
194 | 
195 | ```
196 | sudo systemctl disable firewalld
197 | sudo systemctl stop firewalld
198 | ```
199 | 
200 | 
201 | 
202 | ## Interacting with the Instance using Putty
203 | 
204 | * Start **PuTTY** or your preferred SSH client
205 | 
206 | * Enter _centos@&lt;public\_dns\_entry&gt;_ in the **Host Name** field and **22** in the _Port_ field  
207 | Make sure that SSH is selected as the Connection type
208 |     <p align="center">
209 |     <img src ="./images/connecting_lab/FigConnectingLab-15.png"/>
210 |     </p>
211 |     <p align = "center">
212 |     <i>Session settings in PuTTY</i>
213 |     </p>  
214 |   
215 | * Expand **SSH** under the _Connection_ in the left panel and click **Auth**
216 | 
217 | * Click on the **Browse…** button, browse to where the private key has been stored  
218 |   If you don&#39;t have the private key file (as in workshop) you can skip this step
219 | 
220 | * Click **Open**
221 |     <p align="center">
222 |     <img src ="./images/connecting_lab/FigConnectingLab-16.png"/>
223 |     </p>
224 |     <p align = "center">
225 |     <i>Selecting private key file</i>
226 |     </p>  
227 |     
228 | * Click **Yes** 
229 | The PuTTY window will open. It will ask for the password (in case of the workshop). Enter the provided password
230 |   
231 |     <p align="center">
232 |     <img src ="./images/connecting_lab/FigConnectingLab-17.png"/>
233 |     </p>
234 |     <p align = "center">
235 |     <i>The PuTTY window showing the connection</i>
236 |     </p>  
237 |   
238 | * Set a password for the `centos` user (used for the RDP connection) with the following command, and enter your choice of password when prompted
239 | 
240 |     ```
241 |     sudo passwd centos
242 |     ```
243 | 
244 |     You will use the same password in the RDP connection.
245 | 
246 | * Enter **exit** to close the session
247 | 
248 | 
249 | 
250 | ### Connect using RDP (deprecated for 2018.3)
251 | 
252 | **You can communicate with the instance using command line through PuTTY or Git Bash, and using GUI through remote desktop (RDP) connection.**
253 | 
254 | - Start a remote desktop session
255 | 
256 | - Enter the _IPv4_ address
257 | 
258 | - Click on the **Show Options**
259 | 
260 |   ![](./images/connecting_lab/FigConnectingLab-10.png)
261 | 
262 | - Select the **Display** tab and select _True Color (24 bit)_ and click **Connect**
263 | 
264 |   ![](./images/connecting_lab/FigConnectingLab-11.png)
265 | 
266 | - A certificate warning will be displayed. Click **Yes** to open the RDP session
267 | 
268 | - Enter centos as the username and enter the provided password and click **OK**
269 | 
270 |   ![](./images/connecting_lab/FigConnectingLab-12.png)


--------------------------------------------------------------------------------
/slides/01_Course_Intro.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/01_Course_Intro.pdf


--------------------------------------------------------------------------------
/slides/02_Intro_to_AWS_EC2_F1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/02_Intro_to_AWS_EC2_F1.pdf


--------------------------------------------------------------------------------
/slides/03_SDAccel_Tool_Overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/03_SDAccel_Tool_Overview.pdf


--------------------------------------------------------------------------------
/slides/04_SDAccel_Flows.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/04_SDAccel_Flows.pdf


--------------------------------------------------------------------------------
/slides/05_Optimization_Techniques.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/05_Optimization_Techniques.pdf


--------------------------------------------------------------------------------
/slides/06_RTL_Kernel_Wizard.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/06_RTL_Kernel_Wizard.pdf


--------------------------------------------------------------------------------
/slides/07_Debugging.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/slides/07_Debugging.pdf


--------------------------------------------------------------------------------
/solutions/hello_world/aws/awsf1_2xlarge_18_3_hello_world_sol.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/hello_world/aws/awsf1_2xlarge_18_3_hello_world_sol.sdx.zip


--------------------------------------------------------------------------------
/solutions/hello_world/u200/u200_nimbix_ubuntu16_04_hello_world_sol.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/hello_world/u200/u200_nimbix_ubuntu16_04_hello_world_sol.sdx.zip


--------------------------------------------------------------------------------
/solutions/hello_world/u200/u200_rh7_5_hello_world_sol_.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/hello_world/u200/u200_rh7_5_hello_world_sol_.sdx.zip


--------------------------------------------------------------------------------
/solutions/optimization_lab/aws/aws_2xlarge_18_3_optimization_lab.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/optimization_lab/aws/aws_2xlarge_18_3_optimization_lab.sdx.zip


--------------------------------------------------------------------------------
/solutions/optimization_lab/u200/u200_nimbix_ubuntu16_04_optimization_lab_sol.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/optimization_lab/u200/u200_nimbix_ubuntu16_04_optimization_lab_sol.sdx.zip


--------------------------------------------------------------------------------
/solutions/optimization_lab/u200/u200_rh7_5_optimization_lab_sol.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/solutions/optimization_lab/u200/u200_rh7_5_optimization_lab_sol.sdx.zip


--------------------------------------------------------------------------------
/sources/debug/aws/awsf1_2xlarge_18_3_debug.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/sources/debug/aws/awsf1_2xlarge_18_3_debug.sdx.zip


--------------------------------------------------------------------------------
/sources/debug/u200/u200_nimbix_ubuntu16_04_rtl_kernel.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/sources/debug/u200/u200_nimbix_ubuntu16_04_rtl_kernel.sdx.zip


--------------------------------------------------------------------------------
/sources/debug/u200/u200_rh7_5_debug.sdx.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xupgit/compute_acceleration/425ead9a60dd28f48f6b63d0abd8c4cd3d3976e8/sources/debug/u200/u200_rh7_5_debug.sdx.zip


--------------------------------------------------------------------------------
/sources/helloworld_ocl/src/krnl_vadd.cl:
--------------------------------------------------------------------------------
 1 | /*******************************************************************************
 2 | Vendor: Xilinx
 3 | Associated Filename: krnl_vadd.cl
 4 | Purpose: SDx vector addition example
 5 | *******************************************************************************
 6 | Copyright (C) 2017 XILINX, Inc.
 7 | 
 8 | This file contains confidential and proprietary information of Xilinx, Inc. and
 9 | is protected under U.S. and international copyright and other intellectual
10 | property laws.
11 | 
12 | DISCLAIMER
13 | This disclaimer is not a license and does not grant any rights to the materials
14 | distributed herewith. Except as otherwise provided in a valid license issued to
15 | you by Xilinx, and to the maximum extent permitted by applicable law:
16 | (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
17 | HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
18 | INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
19 | FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
20 | in contract or tort, including negligence, or under any other theory of
21 | liability) for any loss or damage of any kind or nature related to, arising under
22 | or in connection with these materials, including for any direct, or any indirect,
23 | special, incidental, or consequential loss or damage (including loss of data,
24 | profits, goodwill, or any type of loss or damage suffered as a result of any
25 | action brought by a third party) even if such damage or loss was reasonably
26 | foreseeable or Xilinx had been advised of the possibility of the same.
27 | 
28 | CRITICAL APPLICATIONS
29 | Xilinx products are not designed or intended to be fail-safe, or for use in any
30 | application requiring fail-safe performance, such as life-support or safety
31 | devices or systems, Class III medical devices, nuclear facilities, applications
32 | related to the deployment of airbags, or any other applications that could lead
33 | to death, personal injury, or severe property or environmental damage
34 | (individually and collectively, "Critical Applications"). Customer assumes the
35 | sole risk and liability of any use of Xilinx products in Critical Applications,
36 | subject only to applicable laws and regulations governing limitations on product
37 | liability.
38 | 
39 | THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
40 | ALL TIMES.
41 | 
42 | *******************************************************************************/
43 | 
44 | //------------------------------------------------------------------------------
45 | //
46 | // kernel:  vadd
47 | //
48 | // Purpose: Demonstrate Vector Add in OpenCL
49 | //
50 | 
// Number of ints staged in the kernel-local buffer per chunk.
#define BUFFER_SIZE 256

// krnl_vadd: element-wise vector addition, c[k] = a[k] + b[k] for
// k in [0, n_elements). The input is walked in chunks of at most
// BUFFER_SIZE elements: each chunk of `a` is first copied into a local
// array, then added to the matching chunk of `b` and written to `c`.
// Declared for a single work-item (work-group size 1x1x1).
kernel __attribute__((reqd_work_group_size(1, 1, 1)))
void krnl_vadd(
                global const int* a,
                global const int* b,
                global int* c,
                const int n_elements)
{
    int arrayA[BUFFER_SIZE];

    for (int base = 0; base < n_elements; base += BUFFER_SIZE)
    {
        // Length of this chunk: a full buffer, or whatever is left at the tail.
        const int remaining = n_elements - base;
        const int chunk = (remaining < BUFFER_SIZE) ? remaining : BUFFER_SIZE;

        // Stage the current chunk of A in the local buffer.
        readA: for (int k = 0; k < chunk; k++)
        {
            arrayA[k] = a[base + k];
        }

        // Read B, add the staged A values and write the sums to C.
        vadd_wrteC: for (int k = 0; k < chunk; k++)
        {
            c[base + k] = arrayA[k] + b[base + k];
        }
    }
}
76 | 


--------------------------------------------------------------------------------
/sources/helloworld_ocl/src/vadd.cpp:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | Vendor: Xilinx
  3 | Associated Filename: vadd.cpp
  4 | Purpose: SDAccel vector addition
  5 | 
  6 | *******************************************************************************
  7 | Copyright (C) 2017 XILINX, Inc.
  8 | 
  9 | This file contains confidential and proprietary information of Xilinx, Inc. and
 10 | is protected under U.S. and international copyright and other intellectual
 11 | property laws.
 12 | 
 13 | DISCLAIMER
 14 | This disclaimer is not a license and does not grant any rights to the materials
 15 | distributed herewith. Except as otherwise provided in a valid license issued to
 16 | you by Xilinx, and to the maximum extent permitted by applicable law:
 17 | (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
 18 | HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
 19 | INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
 20 | FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
 21 | in contract or tort, including negligence, or under any other theory of
 22 | liability) for any loss or damage of any kind or nature related to, arising under
 23 | or in connection with these materials, including for any direct, or any indirect,
 24 | special, incidental, or consequential loss or damage (including loss of data,
 25 | profits, goodwill, or any type of loss or damage suffered as a result of any
 26 | action brought by a third party) even if such damage or loss was reasonably
 27 | foreseeable or Xilinx had been advised of the possibility of the same.
 28 | 
 29 | CRITICAL APPLICATIONS
 30 | Xilinx products are not designed or intended to be fail-safe, or for use in any
 31 | application requiring fail-safe performance, such as life-support or safety
 32 | devices or systems, Class III medical devices, nuclear facilities, applications
 33 | related to the deployment of airbags, or any other applications that could lead
 34 | to death, personal injury, or severe property or environmental damage
 35 | (individually and collectively, "Critical Applications"). Customer assumes the
 36 | sole risk and liability of any use of Xilinx products in Critical Applications,
 37 | subject only to applicable laws and regulations governing limitations on product
 38 | liability.
 39 | 
 40 | THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
 41 | ALL TIMES.
 42 | 
 43 | *******************************************************************************/
 44 | #include <stdlib.h>
 45 | #include <errno.h>
 46 | #include <limits.h>
 47 | #include <fstream>
 48 | #include <iostream>
 49 | #include <numeric>
 50 | #include "vadd.h"
 51 | 
 52 | static const int DATA_SIZE = 4096;
 53 | 
 54 | int main(int argc, char* argv[]) {
 55 |     
 56 |     const char *kernel_name = "krnl_vadd"; // Open CL Kernel name
 57 |     int init_value; // Initialization value for vector arrays
 58 |     std::vector<cl::Device> devices; // OpenCL devices
 59 |     cl::Device device;
 60 | 
 61 |     if(argc != 2 and argc != 3) {
 62 |         std::cout << "Usage: " << argv[0] <<"<xclbin> [vector initialization value]\n" << std::endl;
 63 |         return EXIT_FAILURE;
 64 |     }
 65 |     char* xclbinFilename = argv[1];
 66 |     char* p;
 67 | 
 68 |     // Check if vector init value was passed, and convert to int, otherwise default to 0
 69 |     if(argc == 3){
 70 |         errno = 0;
 71 |         long conv = strtol(argv[2], &p, 10);
 72 |         if(errno !=0 || *p != '\0' || conv > INT_MAX){
 73 |             printf("Invalid vector initialization value %s\nValue should be an Integer\nExiting\n", argv[2]);
 74 |         return -1;
 75 |         }else{
 76 |             init_value = conv;
 77 |         }
 78 |     }else{
 79 |         init_value = 0;
 80 |     }
 81 | 
 82 |     // Compute the size of array in bytes
 83 |     size_t size_in_bytes = DATA_SIZE * sizeof(int);
 84 |     
 85 |     // Creates a vector of DATA_SIZE elements 
 86 |     // using customized allocator for getting buffer alignment to 4k boundary
 87 |     std::vector<int,aligned_allocator<int>> source_a(DATA_SIZE);
 88 |     std::vector<int,aligned_allocator<int>> source_b(DATA_SIZE);
 89 |     std::vector<int,aligned_allocator<int>> source_results(DATA_SIZE);
 90 |     
 91 |     // Read in a user defined initial value for the arrays
 92 | 
 93 |     printf("Init arrays\n");
 94 |     // Initialize the arrays
 95 |     std::iota (std::begin(source_a), std::end(source_a), init_value);
 96 |     std::iota (std::begin(source_b), std::end(source_b), init_value);
 97 |     
 98 |     // Check for the Xilinx device on the current platform
 99 |     std::cout << "Get Xilinx platform" << std::endl;
100 |     get_xilinx_platform(&device, &devices);
101 | 
102 |     // Creating Context and Command Queue for selected device
103 |     cl::Context context(device);
104 |     cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE);
105 |     cl::Kernel krnl_vector_add;
106 |     
107 |     krnl_vector_add = load_xcl_bin(kernel_name, xclbinFilename, &context, &devices);
108 |     
109 |     // Allocate memory on the Device. The cl::Buffer objects can
110 |     // be used to reference the memory locations on the device. 
111 |     cl::Buffer buffer_a(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,  
112 |             size_in_bytes, source_a.data());
113 |     cl::Buffer buffer_b(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,  
114 |             size_in_bytes, source_b.data());
115 |     cl::Buffer buffer_result(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, 
116 |             size_in_bytes, source_results.data());
117 |     
118 |     // Data will be transferred from host memory over PCIe to the FPGA on-board
119 |     // DDR memory.
120 |     q.enqueueMigrateMemObjects({buffer_a,buffer_b},0/* 0 means from host*/);
121 | 
122 |     // set the kernel Arguments
123 |     int narg=0;
124 |     krnl_vector_add.setArg(narg++,buffer_a);
125 |     krnl_vector_add.setArg(narg++,buffer_b);
126 |     krnl_vector_add.setArg(narg++,buffer_result);
127 |     krnl_vector_add.setArg(narg++,DATA_SIZE);
128 | 
129 |     /*
130 |        Launch the Kernel
131 |     */
132 |     q.enqueueTask(krnl_vector_add);
133 | 
134 |     // Get the results: Transfer data from FPGA DDR to host memory "source_results"
135 |     q.enqueueMigrateMemObjects({buffer_result},CL_MIGRATE_MEM_OBJECT_HOST);
136 |     q.finish();
137 | 
138 |     // Verify the result
139 |     int match = 0;
140 |     for (int i = 0; i < DATA_SIZE; i++) {
141 |         int host_result = source_a[i] + source_b[i];
142 |         printf(results_message.c_str(), source_a[i], source_b[i], source_results[i]);
143 |         if (source_results[i] != host_result) {
144 |             printf(error_message.c_str(), i, host_result, source_results[i]);
145 |             match = 1;
146 |             break;
147 |         }
148 |     }
149 | 
150 |     std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl; 
151 |     return (match ? EXIT_FAILURE :  EXIT_SUCCESS);
152 | 
153 | }
154 | 


--------------------------------------------------------------------------------
/sources/helloworld_ocl/src/vadd.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | Vendor: Xilinx
  3 | Associated Filename: vadd.h
  4 | Purpose: SDAccel vector addition
  5 | Revision History: January 28, 2016
  6 | 
  7 | *******************************************************************************
  8 | Copyright (C) 2016 XILINX, Inc.
  9 | 
 10 | This file contains confidential and proprietary information of Xilinx, Inc. and
 11 | is protected under U.S. and international copyright and other intellectual
 12 | property laws.
 13 | 
 14 | DISCLAIMER
 15 | This disclaimer is not a license and does not grant any rights to the materials
 16 | distributed herewith. Except as otherwise provided in a valid license issued to
 17 | you by Xilinx, and to the maximum extent permitted by applicable law:
 18 | (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
 19 | HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
 20 | INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
 21 | FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
 22 | in contract or tort, including negligence, or under any other theory of
 23 | liability) for any loss or damage of any kind or nature related to, arising under
 24 | or in connection with these materials, including for any direct, or any indirect,
 25 | special, incidental, or consequential loss or damage (including loss of data,
 26 | profits, goodwill, or any type of loss or damage suffered as a result of any
 27 | action brought by a third party) even if such damage or loss was reasonably
 28 | foreseeable or Xilinx had been advised of the possibility of the same.
 29 | 
 30 | CRITICAL APPLICATIONS
 31 | Xilinx products are not designed or intended to be fail-safe, or for use in any
 32 | application requiring fail-safe performance, such as life-support or safety
 33 | devices or systems, Class III medical devices, nuclear facilities, applications
 34 | related to the deployment of airbags, or any other applications that could lead
 35 | to death, personal injury, or severe property or environmental damage
 36 | (individually and collectively, "Critical Applications"). Customer assumes the
 37 | sole risk and liability of any use of Xilinx products in Critical Applications,
 38 | subject only to applicable laws and regulations governing limitations on product
 39 | liability.
 40 | 
 41 | THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
 42 | ALL TIMES.
 43 | 
 44 | *******************************************************************************/
 45 | 
 46 | #pragma once
 47 | 
 48 | #define CL_HPP_CL_1_2_DEFAULT_BUILD
 49 | #define CL_HPP_TARGET_OPENCL_VERSION 120
 50 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 51 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
 52 | 
 53 | #include <CL/cl2.hpp>
 54 | 
 55 | //TARGET_DEVICE macro needs to be passed from gcc command line
 56 | #if defined(SDX_PLATFORM) && !defined(TARGET_DEVICE)
 57 |     #define STR_VALUE(arg)      #arg
 58 |     #define GET_STRING(name) STR_VALUE(name)
 59 |     #define TARGET_DEVICE GET_STRING(SDX_PLATFORM)
 60 | #endif
 61 | 
 62 | static const std::string error_message =
 63 |     "Error: Result mismatch:\n"
 64 |     "i = %d CPU result = %d Device result = %d\n";
 65 | 
 66 | static const std::string results_message =
 67 |     "%d + %d = %d\n";
 68 |     
 69 | //Customized buffer allocation for 4K boundary alignment
 70 | template <typename T>
 71 | struct aligned_allocator
 72 | {
 73 |   using value_type = T;
 74 |   T* allocate(std::size_t num)
 75 |   {
 76 |     void* ptr = nullptr;
 77 |     if (posix_memalign(&ptr,4096,num*sizeof(T)))
 78 |       throw std::bad_alloc();
 79 |     return reinterpret_cast<T*>(ptr);
 80 |   }
 81 |   void deallocate(T* p, std::size_t num)
 82 |   {
 83 |     free(p);
 84 |   }
 85 | };
 86 | 
 87 | 
 88 | int get_xilinx_platform(cl::Device *device, std::vector<cl::Device> *devices){
 89 | 
 90 |     //TARGET_DEVICE macro needs to be passed from gcc command line
 91 |     const char *target_device_name = TARGET_DEVICE;
 92 |     
 93 |     std::vector<cl::Platform> platforms;
 94 |     bool found_device = false;
 95 | //traversing all Platforms To find Xilinx Platform and targeted
 96 |     //Device in Xilinx Platform
 97 |     cl::Platform::get(&platforms);
 98 |     for(size_t i = 0; (i < platforms.size() ) & (found_device == false) ;i++){
 99 |         cl::Platform platform = platforms[i];
100 |         std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
101 |         if ( platformName == "Xilinx"){
102 |             devices->clear();
103 |             platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, devices);
104 | 
105 |             //Traversing All Devices of Xilinx Platform
106 |             for (size_t j = 0 ; j < devices->size() ; j++){
107 |                 *device = (*devices)[j];
108 |                 std::string deviceName = device->getInfo<CL_DEVICE_NAME>();
109 |                 if (deviceName == target_device_name){
110 |                     found_device = true;
111 |                     std::cout << "Found: " << deviceName << std::endl;
112 |                     break;
113 |                 }
114 |             }
115 |         }
116 |     }
117 |     if (found_device == false){
118 |        std::cout << "Error: Unable to find Target Device "
119 |            << target_device_name << std::endl;
120 |        return EXIT_FAILURE;
121 |     }
122 |     return 0;
123 | }
124 | 
125 | cl::Kernel load_xcl_bin(const char* kernel_name, char* xclbinFilename, cl::Context* context, std::vector<cl::Device> *devices){
126 |     // Load xclbin 
127 |     std::cout << "Loading: '" << xclbinFilename << "'\n";
128 |     std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
129 |     bin_file.seekg (0, bin_file.end);
130 |     unsigned nb = bin_file.tellg();
131 |     bin_file.seekg (0, bin_file.beg);
132 |     char *buf = new char [nb];
133 |     bin_file.read(buf, nb);
134 |     
135 |     // Creating Program from Binary File
136 |     cl::Program::Binaries bins;
137 |     bins.push_back({buf,nb});
138 |     devices->resize(1);
139 |     cl::Program program(*context, *devices, bins);
140 |         // This call will get the kernel object from program. A kernel is an 
141 |     // OpenCL function that is executed on the FPGA. 
142 |     cl::Kernel krnl_vector_add(program, kernel_name);
143 |     return krnl_vector_add;
144 | }
145 | 
146 | 


--------------------------------------------------------------------------------
/sources/helloworld_ocl/src/xcl.cpp:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | Vendor: Xilinx
  3 | Associated Filename: xcl.c
  4 | Purpose: SDAccel histogram equalization example
  5 | Revision History: December 6, 2015
  6 | 
  7 | *******************************************************************************
  8 | Copyright (C) 2015 XILINX, Inc.
  9 | 
 10 | This file contains confidential and proprietary information of Xilinx, Inc. and
 11 | is protected under U.S. and international copyright and other intellectual
 12 | property laws.
 13 | 
 14 | DISCLAIMER
 15 | This disclaimer is not a license and does not grant any rights to the materials
 16 | distributed herewith. Except as otherwise provided in a valid license issued to
 17 | you by Xilinx, and to the maximum extent permitted by applicable law:
 18 | (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
 19 | HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
 20 | INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
 21 | FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
 22 | in contract or tort, including negligence, or under any other theory of
 23 | liability) for any loss or damage of any kind or nature related to, arising under
 24 | or in connection with these materials, including for any direct, or any indirect,
 25 | special, incidental, or consequential loss or damage (including loss of data,
 26 | profits, goodwill, or any type of loss or damage suffered as a result of any
 27 | action brought by a third party) even if such damage or loss was reasonably
 28 | foreseeable or Xilinx had been advised of the possibility of the same.
 29 | 
 30 | CRITICAL APPLICATIONS
 31 | Xilinx products are not designed or intended to be fail-safe, or for use in any
 32 | application requiring fail-safe performance, such as life-support or safety
 33 | devices or systems, Class III medical devices, nuclear facilities, applications
 34 | related to the deployment of airbags, or any other applications that could lead
 35 | to death, personal injury, or severe property or environmental damage
 36 | (individually and collectively, "Critical Applications"). Customer assumes the
 37 | sole risk and liability of any use of Xilinx products in Critical Applications,
 38 | subject only to applicable laws and regulations governing limitations on product
 39 | liability.
 40 | 
 41 | THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
 42 | ALL TIMES.
 43 | 
 44 | *******************************************************************************/
 45 | 
 46 | #include <stdio.h>
 47 | #include <stdlib.h>
 48 | #include <unistd.h>
 49 | 
 50 | #include <string.h>
 51 | #include <math.h>
 52 | 
 53 | #include "xcl.h"
 54 | 
 55 | static void* smalloc(size_t size) {
 56 | 	void* ptr;
 57 | 
 58 | 	ptr = malloc(size);
 59 | 
 60 | 	if (ptr == NULL) {
 61 | 		printf("Error: Cannot allocate memory\n");
 62 | 		printf("Test failed\n");
 63 | 		exit(EXIT_FAILURE);
 64 | 	}
 65 | 
 66 | 	return ptr;
 67 | }
 68 | 
 69 | static int load_file_to_memory(const char *filename, char **result) {
 70 | 	unsigned int size;
 71 | 
 72 | 	FILE *f = fopen(filename, "rb");
 73 | 	if (f == NULL) {
 74 | 		*result = NULL;
 75 | 		printf("Error: Could not read file %s\n", filename);
 76 | 		exit(EXIT_FAILURE);
 77 | 	}
 78 | 
 79 | 	fseek(f, 0, SEEK_END);
 80 | 	size = ftell(f);
 81 | 	fseek(f, 0, SEEK_SET);
 82 | 
 83 | 	*result = (char *) smalloc(sizeof(char)*(size+1));
 84 | 
 85 | 	if (size != fread(*result, sizeof(char), size, f)) {
 86 | 		free(*result);
 87 | 		printf("Error: read of kernel failed\n");
 88 | 		exit(EXIT_FAILURE);
 89 | 	}
 90 | 
 91 | 	fclose(f);
 92 | 	(*result)[size] = 0;
 93 | 
 94 | 	return size;
 95 | }
 96 | 
 97 | xcl_world xcl_world_single(cl_device_type device_type, const char *target_vendor, 
 98 |                            const char *target_device) {
 99 | 	int err;
100 | 	xcl_world world;
101 | 	cl_uint num_platforms;
102 | 
103 | 	err = clGetPlatformIDs(0, NULL, &num_platforms);
104 | 	if (err != CL_SUCCESS) {
105 | 		printf("Error: no platforms available or OpenCL install broken");
106 | 		printf("Test failed\n");
107 | 		exit(EXIT_FAILURE);
108 | 	}
109 | 
110 | 	cl_platform_id *platform_ids = (cl_platform_id *) malloc(sizeof(cl_platform_id) * num_platforms);
111 | 
112 | 	if (platform_ids == NULL) {
113 | 		printf("Error: Out of Memory\n");
114 | 		printf("Test failed\n");
115 | 		exit(EXIT_FAILURE);
116 | 	}
117 | 
118 | 	err = clGetPlatformIDs(num_platforms, platform_ids, NULL);
119 | 	if (err != CL_SUCCESS) {
120 | 		printf("Error: Failed to find an OpenCL platform!\n");
121 | 		printf("Test failed\n");
122 | 		exit(EXIT_FAILURE);
123 | 	}
124 | 
125 | 	int i;
126 |         char cl_platform_vendor[1001];
127 |         //find target vendor if target_vendor is specified
128 |         if (target_vendor != NULL) {
129 |                 for(i = 0; i < num_platforms; i++) {
130 |                         err = clGetPlatformInfo(platform_ids[i], CL_PLATFORM_VENDOR, 1000, (void *)cl_platform_vendor,NULL);
131 |                         if (err != CL_SUCCESS) {
132 |                                 printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n");
133 |                                 printf("Test failed\n");
134 |                                 exit(EXIT_FAILURE);
135 |                         }
136 |                         if ((target_vendor != NULL) && (strcmp(cl_platform_vendor, target_vendor) == 0)) {
137 |                                 printf("INFO: Selected platform %d from %s\n", i, cl_platform_vendor);
138 |                                 world.platform_id = platform_ids[i];
139 |                                 break;
140 |                         }
141 |                 }
142 |         } else {
143 |                 for(i = 0; i < num_platforms; i++) {
144 |                         err = clGetDeviceIDs(platform_ids[i], device_type,
145 |                                              1, &world.device_id, NULL);
146 |                         if (err == CL_SUCCESS) {
147 |                                 world.platform_id = platform_ids[i];
148 |                                 break;
149 |                         }
150 |                 }            
151 |         }
152 | 	free(platform_ids);
153 | 	if (i == num_platforms) {
154 | 		printf("Error: Failed to find a platform\n");
155 | 		printf("Test failed\n");
156 | 		exit(EXIT_FAILURE);
157 | 	}
158 | 
159 |         if (target_device != NULL) {
160 |                 //find target device
161 |                 cl_device_id devices[16];  // compute device id 
162 |                 cl_uint num_devices;
163 |                 char cl_device_name[100];
164 |                 err = clGetDeviceIDs(world.platform_id, CL_DEVICE_TYPE_ACCELERATOR,
165 |                                      16, devices, &num_devices);
166 |                 if (err != CL_SUCCESS) {
167 |                         printf("Error: Failed to create a device group!\n");
168 |                         printf("Test failed\n");
169 |                         exit(EXIT_FAILURE);
170 |                 }
171 | 
172 |                 //iterate all devices to select the target device. 
173 |                 for (i=0; i<num_devices; i++) {
174 |                         err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 100, cl_device_name, 0);
175 |                         if (err != CL_SUCCESS) {
176 |                                 printf("Error: Failed to get device name for device %d!\n", i);
177 |                                 printf("Test failed\n");
178 |                                 exit(EXIT_FAILURE);
179 |                         }
180 |                         //printf("CL_DEVICE_NAME %s\n", cl_device_name);
181 |                         if (strcmp(cl_device_name, target_device) == 0) {
182 |                                 world.device_id = devices[i];
183 |                                 printf("INFO: Selected %s as the target device\n", cl_device_name);
184 |                                 break;
185 |                         }
186 |                 }
187 | 
188 |                 if (i == num_devices) {
189 |                         printf("Error: Failed to find target device %s\n", target_device);
190 |                         printf("Test failed\n");
191 |                         exit(EXIT_FAILURE);
192 |                 }
193 |         }
194 | 
195 | 	world.context = clCreateContext(0, 1, &world.device_id,
196 | 	                                NULL, NULL, &err);
197 | 	if (err != CL_SUCCESS) {
198 | 		printf("Error: Failed to create a compute context!\n");
199 | 		printf("Test failed\n");
200 | 		exit(EXIT_FAILURE);
201 | 	}
202 | 
203 | 	world.command_queue = clCreateCommandQueue(world.context,
204 | 	                                           world.device_id,
205 | 	                                           CL_QUEUE_PROFILING_ENABLE,
206 | 	                                           &err);
207 | 	if (err != CL_SUCCESS) {
208 | 		printf("Error: Failed to create a command queue!\n");
209 | 		printf("Test failed\n");
210 | 		exit(EXIT_FAILURE);
211 | 	}
212 | 
213 | 	return world;
214 | }
215 | 
216 | void xcl_release_world(xcl_world world) {
217 | 	clReleaseCommandQueue(world.command_queue);
218 | 	clReleaseContext(world.context);
219 | }
220 | 
221 | cl_kernel xcl_import_binary(xcl_world world,
222 |                             const char *krnl_file,
223 |                             const char *krnl_name)
224 | {
225 | 	int err;
226 | 
227 | 	char *krnl_bin;
228 | 	const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin);
229 | 
230 | 	cl_program program = clCreateProgramWithBinary(world.context, 1,
231 | 	                                    &world.device_id, &krnl_size,
232 | 	                                    (const unsigned char**) &krnl_bin,
233 | 	                                    NULL, &err);
234 | 	if ((!program) || (err!=CL_SUCCESS)) {
235 | 		printf("Error: Failed to create compute program from binary %d!\n",
236 | 		       err);
237 | 		printf("Test failed\n");
238 | 		exit(EXIT_FAILURE);
239 | 	}
240 | 
241 | 	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
242 | 	if (err != CL_SUCCESS) {
243 | 		size_t len;
244 | 		char buffer[2048];
245 | 
246 | 		printf("Error: Failed to build program executable!\n");
247 | 		clGetProgramBuildInfo(program, world.device_id, CL_PROGRAM_BUILD_LOG,
248 | 		                      sizeof(buffer), buffer, &len);
249 | 		printf("%s\n", buffer);
250 | 		printf("Test failed\n");
251 | 		exit(EXIT_FAILURE);
252 | 	}
253 | 
254 | 	cl_kernel kernel = clCreateKernel(program, krnl_name, &err);
255 | 	if (!kernel || err != CL_SUCCESS) {
256 | 		printf("Error: Failed to create kernel for %s: %d\n", krnl_name, err);
257 | 		printf("Test failed\n");
258 | 		exit(EXIT_FAILURE);
259 | 	}
260 | 
261 | 	/* if program is released, then EnqueueNDRangeKernel fails with
262 | 	 * INVALID_KERNEL */
263 | 	/* clReleaseProgram(program); */
264 | 	free(krnl_bin);
265 | 
266 | 	return kernel;
267 | }
268 | 
269 | cl_kernel xcl_import_source(xcl_world world,
270 |                             const char *krnl_file,
271 |                             const char *krnl_name)
272 | {
273 | 	int err;
274 | 
275 | 	char *krnl_bin;
276 | 	load_file_to_memory(krnl_file, &krnl_bin);
277 | 
278 | 	cl_program program = clCreateProgramWithSource(world.context, 1,
279 | 	                                               (const char**) &krnl_bin,
280 | 	                                               0, &err);
281 | 	if ((err!=CL_SUCCESS) || (!program))  {
282 | 		printf("Error: Failed to create compute program from binary %d!\n",
283 | 		       err);
284 | 		printf("Test failed\n");
285 | 		exit(EXIT_FAILURE);
286 | 	}
287 | 
288 | 	err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
289 | 	if (err != CL_SUCCESS) {
290 | 		size_t len;
291 | 		char buffer[2048];
292 | 
293 | 		printf("Error: Failed to build program executable!\n");
294 | 		clGetProgramBuildInfo(program, world.device_id, CL_PROGRAM_BUILD_LOG,
295 | 		                      sizeof(buffer), buffer, &len);
296 | 		printf("%s\n", buffer);
297 | 		printf("Test failed\n");
298 | 		exit(EXIT_FAILURE);
299 | 	}
300 | 
301 | 	cl_kernel kernel = clCreateKernel(program, krnl_name, &err);
302 | 	if (!kernel || err != CL_SUCCESS) {
303 | 		printf("Error: Failed to create kernel for %s: %d\n", krnl_name, err);
304 | 		printf("Test failed\n");
305 | 		exit(EXIT_FAILURE);
306 | 	}
307 | 
308 | 	/* if program is released, then EnqueueNDRangeKernel fails with
309 | 	 * INVALID_KERNEL */
310 | 	/* clReleaseProgram(program); */
311 | 	free(krnl_bin);
312 | 
313 | 	return kernel;
314 | }
315 | 
316 | void xcl_set_kernel_arg(cl_kernel krnl, cl_uint num, size_t size, const void *ptr) {
317 | 	int err = clSetKernelArg(krnl, num, size, ptr);
318 | 
319 | 	if(err != CL_SUCCESS) {
320 | 		printf("Error: Failed to set kernel arg\n");
321 | 		printf("Test failed\n");
322 | 		exit(EXIT_FAILURE);
323 | 	}
324 | }
325 | 
326 | cl_mem xcl_malloc(xcl_world world, cl_mem_flags flags, size_t size) {
327 | 	cl_mem mem = clCreateBuffer(world.context, flags, size, NULL, NULL);
328 | 
329 | 	if (!mem) {
330 | 		printf("Error: Failed to allocate device memory!\n");
331 | 		printf("Test failed\n");
332 | 		exit(EXIT_FAILURE);
333 | 	}
334 | 
335 | 	return mem;
336 | }
337 | 
338 | void xcl_memcpy_to_device(xcl_world world, cl_mem dest, void* src,
339 |                           size_t size) {
340 | 	int err = clEnqueueWriteBuffer(world.command_queue, dest, CL_TRUE, 0, size,
341 | 	                               src, 0, NULL, NULL);
342 | 	if (err != CL_SUCCESS) {
343 | 		printf("Error: Failed to write to source array a!\n");
344 | 		printf("Test failed\n");
345 | 		exit(EXIT_FAILURE);
346 | 	}
347 | }
348 | 
349 | void xcl_memcpy_from_device(xcl_world world, void* dest, cl_mem src,
350 |                             size_t size) {
351 | 	int err = clEnqueueReadBuffer(world.command_queue, src, CL_TRUE, 0, size,
352 | 	                              dest, 0, NULL, NULL);
353 | 	if (err != CL_SUCCESS) {
354 | 		printf("Error: Failed to read output array! %d\n", err);
355 | 		printf("Test failed\n");
356 | 		exit(EXIT_FAILURE);
357 | 	}
358 | }
359 | 
360 | unsigned long xcl_run_kernel3d(xcl_world world, cl_kernel krnl,
361 |                                size_t x, size_t y, size_t z) {
362 | 	size_t size[3] = {x, y, z};
363 | 	cl_event event;
364 | 	unsigned long start, stop;
365 | 
366 | 	int err = clEnqueueNDRangeKernel(world.command_queue, krnl, 3,
367 | 	                                 NULL, size, size, 0, NULL, &event);
368 | 	if( err != CL_SUCCESS) {
369 | 		printf("Error: failed to execute kernel! %d\n", err);
370 | 		printf("Test failed\n");
371 | 		exit(EXIT_FAILURE);
372 | 	}
373 | 
374 | 	clFinish(world.command_queue);
375 | 
376 | 	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
377 | 	                        sizeof(unsigned long), &start, NULL);
378 | 	clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
379 | 	                        sizeof(unsigned long), &stop, NULL);
380 | 
381 | 	return stop - start;
382 | }
383 | 


--------------------------------------------------------------------------------
/sources/helloworld_ocl/src/xcl.h:
--------------------------------------------------------------------------------
  1 | /*******************************************************************************
  2 | Vendor: Xilinx
  3 | Associated Filename: xcl.h
  4 | Purpose: SDAccel histogram equalization example
  5 | Revision History: December 6, 2015
  6 | 
  7 | *******************************************************************************
  8 | Copyright (C) 2015 XILINX, Inc.
  9 | 
 10 | This file contains confidential and proprietary information of Xilinx, Inc. and
 11 | is protected under U.S. and international copyright and other intellectual
 12 | property laws.
 13 | 
 14 | DISCLAIMER
 15 | This disclaimer is not a license and does not grant any rights to the materials
 16 | distributed herewith. Except as otherwise provided in a valid license issued to
 17 | you by Xilinx, and to the maximum extent permitted by applicable law:
 18 | (1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
 19 | HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
 20 | INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
 21 | FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
 22 | in contract or tort, including negligence, or under any other theory of
 23 | liability) for any loss or damage of any kind or nature related to, arising under
 24 | or in connection with these materials, including for any direct, or any indirect,
 25 | special, incidental, or consequential loss or damage (including loss of data,
 26 | profits, goodwill, or any type of loss or damage suffered as a result of any
 27 | action brought by a third party) even if such damage or loss was reasonably
 28 | foreseeable or Xilinx had been advised of the possibility of the same.
 29 | 
 30 | CRITICAL APPLICATIONS
 31 | Xilinx products are not designed or intended to be fail-safe, or for use in any
 32 | application requiring fail-safe performance, such as life-support or safety
 33 | devices or systems, Class III medical devices, nuclear facilities, applications
 34 | related to the deployment of airbags, or any other applications that could lead
 35 | to death, personal injury, or severe property or environmental damage
 36 | (individually and collectively, "Critical Applications"). Customer assumes the
 37 | sole risk and liability of any use of Xilinx products in Critical Applications,
 38 | subject only to applicable laws and regulations governing limitations on product
 39 | liability.
 40 | 
 41 | THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
 42 | ALL TIMES.
 43 | 
 44 | *******************************************************************************/
 45 | 
 46 | 
 47 | #pragma once
 48 | 
 49 | #include "xcl.h"
 50 | #include <CL/opencl.h>
 51 | 
/* Bundle of the OpenCL handles that the xcl_* helpers operate on. */
typedef struct {
	cl_platform_id platform_id;     /* platform the device was selected from */
	cl_device_id device_id;         /* device the context and queue are created for */
	cl_context context;             /* context created for device_id */
	cl_command_queue command_queue; /* command queue on context for device_id */
} xcl_world;
 58 | 
 59 | /* xcl_world_single
 60 |  *
 61 |  * Description:
 62 |  *   Setup an xcl_world for the case when there is a single
 63 |  *   device in the system.
 64 |  *
 65 |  * Inputs:
 66 |  *   device_type - the type of device (e.g. CL_DEVICE_TYPE_ACCELERATOR)
 67 |  *
 68 |  * Returns:
 69 |  *   A struct containing the platform_id, device_id, context, and command
 70 |  *   queue.
 71 |  */
 72 | xcl_world xcl_world_single(cl_device_type device_type, 
 73 |                            const char *target_vendor, 
 74 |                            const char *target_device);
 75 | 
 76 | /* xcl_release_world
 77 |  *
 78 |  * Description:
 79 |  *   Release memory used by xcl_world struct.
 80 |  *
 81 |  * Inputs:
 82 |  *   world - xcl_world to release memory from.
 83 |  */
 84 | void xcl_release_world(xcl_world world);
 85 | 
 86 | /* xcl_import_binary
 87 |  *
 88 |  * Description:
 89 |  *   Import precompiled program (as commonly created by the Xilinx OpenCL
 90 |  *   flow).
 91 |  *
 92 |  * Inputs:
 93 |  *   world - xcl_world to import into.
 94 |  *   krnl_file - file name of the kernel to import.
 95 |  *   krnl_name - name of kernel.
 96 |  *
 97 |  * Returns:
 98 |  *   An OpenCL kernel object for krnl_name, created from the krnl_file binary.
 99 |  */
100 | cl_kernel xcl_import_binary(xcl_world world, const char *krnl_file, const char *krnl_name);
101 | 
102 | /* xcl_import_source
103 |  *
104 |  * Description:
105 |  *   Import opencl source code.
106 |  *
107 |  * Inputs:
108 |  *   world - xcl_world to import into.
109 |  *   krnl_file - file name of the kernel to import.
110 |  *   krnl_name - name of kernel.
111 |  *
112 |  * Returns:
113 |  *   An OpenCL kernel object for krnl_name, created from the krnl_file source.
114 |  */
115 | cl_kernel xcl_import_source(xcl_world world, const char *krnl_file, const char *krnl_name);
116 | 
117 | /* xcl_set_kernel_arg
118 |  *
119 |  * Description:
120 |  *   Set kernel arguments
121 |  *
122 |  * Inputs:
123 |  *   krnl - kernel to set values for
124 |  *   num  - which kernel arg to set
125 |  *   size - size of argument
126 |  *   ptr  - address of value
127 |  */
128 | void xcl_set_kernel_arg(cl_kernel krnl, cl_uint num, size_t size, const void *ptr);
129 | 
130 | /* xcl_malloc
131 |  *
132 |  * Description:
133 |  *   Allocate memory for a buffer on the FPGA device.  Exit program on
134 |  *   error.
135 |  *
136 |  * Inputs:
137 |  *   world - xcl_world of the device to create buffer on.
138 |  *   flags - passed to clCreateBuffer.
139 |  *   size  - buffer size in bytes (like malloc).
140 |  */
141 | cl_mem xcl_malloc(xcl_world world, cl_mem_flags flags, size_t size);
142 | 
143 | /* xcl_memcpy_to_device/xcl_memcpy_from_device
144 |  *
145 |  * Description:
146 |  *   Copy memory from the host to the FPGA device (or vice versa).  The
147 |  *   memory on the FPGA must be allocated with xcl_malloc (or the lower
148 |  *   level opencl functions)
149 |  *
150 |  * Inputs:
151 |  *   world - xcl_world to copy the buffer into.
152 |  *   dest  - memory address on the FPGA to copy to.
153 |  *   src   - memory address on the host to copy from.
154 |  *   size  - number of bytes in src to copy to dest.
155 |  */
156 | void xcl_memcpy_to_device(xcl_world world, cl_mem dest, void* src,
157 |                           size_t size);
158 | void xcl_memcpy_from_device(xcl_world world, void* dest, cl_mem src,
159 |                             size_t size);
160 | 
161 | /* xcl_run_kernel3d
162 |  *
163 |  * Description:
164 |  *   Run a kernel with a 3 dimensional thread array. In this configuration,
165 |  *   there will be x*y*z threads created with a rank in each dimension.
166 |  *
167 |  * Inputs:
168 |  *   world - xcl_world to use for running the kernel.
169 |  *   krnl  - application to run on the device.
170 |  *   x     - number of threads in the x direction.
171 |  *   y     - number of threads in the y direction.
172 |  *   z     - number of threads in the z direction.
173 |  *
174 |  * Returns:
175 |  *   For purposes of benchmarking, the return of this program is the length of
176 |  *   time that the kernel took to run to completion.
177 |  */
178 | unsigned long xcl_run_kernel3d(xcl_world world, cl_kernel krnl,
179 |                                size_t x, size_t y, size_t z);
180 | 


--------------------------------------------------------------------------------
/sources/optimization_lab/idct.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2018, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 30 | #include "CL/opencl.h"
 31 | #include <vector>
 32 | #include <math.h>
 33 | #include <chrono>
 34 | #include <thread>
 35 | #include <utility>
 36 | #include <assert.h>
 37 | #include <omp.h>
 38 | #include <string.h>
 39 | #include <iostream>
 40 | #include <algorithm>
 41 | 
 42 | typedef short int16_t;
 43 | typedef unsigned short uint16_t;
 44 | 
 45 | void idctSoft(const int16_t block[64], const uint16_t q[64], int16_t outp[64], bool ignore_dc);
 46 | 
 47 | /* *************************************************************************** 
 48 | 
 49 | aligned_allocator
 50 | 
 51 | This struct provides an 4k alligned memory allocator. Using this
 52 | allocator allows data objects to be aligned for efficient data
 53 | transfer to the kernel.
 54 | 
 55 | The struct provides an allocate and deallocate function
 56 | 
 57 | *************************************************************************** */
 58 | template <typename T>
 59 | struct aligned_allocator
 60 | {
 61 |   using value_type = T;
 62 |   T* allocate(std::size_t num)
 63 |   {
 64 |     void* ptr = nullptr;
 65 |     if (posix_memalign(&ptr,4096,num*sizeof(T)))
 66 |       throw std::bad_alloc();
 67 |     return reinterpret_cast<T*>(ptr);
 68 |   }
 69 |   void deallocate(T* p, std::size_t num)
 70 |   {
 71 |     free(p);
 72 |   }
 73 | };
 74 | 
 75 | /* *************************************************************************** 
 76 | 
 77 | smalloc
 78 | 
 79 | Simple helper function to malloc memory of a specifc size. The
 80 | function will throw an error if the memory can not be successfully
 81 | allocated.
 82 | 
 83 | *************************************************************************** */
 84 | static void* smalloc(size_t size) {
 85 |   void* ptr;
 86 | 
 87 |   ptr = malloc(size);
 88 | 
 89 |   if (ptr == NULL) {
 90 |     printf("Error: Cannot allocate memory\n");
 91 |     exit(EXIT_FAILURE);
 92 |   }
 93 | 
 94 |   return ptr;
 95 | }
 96 | 
 97 | /* *************************************************************************** 
 98 | 
 99 | load_file_to_memory
100 | 
101 | This function reads from the file (filename) an xclbin into
102 | memory. This binary information is returned in the argument result.
103 | 
104 | *************************************************************************** */
105 | static int load_file_to_memory(const char *filename, char **result) {
106 |   unsigned int size;
107 | 
108 |   FILE *f = fopen(filename, "rb");
109 |   if (f == NULL) {
110 |     *result = NULL;
111 |     printf("Error: Could not read file %s\n", filename);
112 |     exit(EXIT_FAILURE);
113 |   }
114 | 
115 |   fseek(f, 0, SEEK_END);
116 |   size = ftell(f);
117 |   fseek(f, 0, SEEK_SET);
118 | 
119 |   *result = (char *) smalloc(sizeof(char)*(size+1));
120 | 
121 |   if (size != fread(*result, sizeof(char), size, f)) {
122 |     free(*result);
123 |     printf("Error: read of kernel failed\n");
124 |     exit(EXIT_FAILURE);
125 |   }
126 | 
127 |   fclose(f);
128 |   (*result)[size] = 0;
129 | 
130 |   return size;
131 | }
132 | 
133 | 
134 | /* *************************************************************************** 
135 | 
136 | oclDct
137 | 
138 | This class encapsulates all runtime kernel interaction through openCL.
139 | After the class is constructed, the objects are supposed to be
140 | initialized (init), before kernel communication and execution can be
141 | triggered through calls to write, run, and read. Once all transactions
142 | are enqueued, the user is expected to call finish to ensure all
143 | transactions are completed.
144 | 
145 | The class manages the synchronization events and allows transactions to be
146 | enqueued in bulk. All buffer management is performed in the oclDct
147 | class.
148 | 
149 | *************************************************************************** */
class oclDct {

// Number of scheduling slots; sizes the buffer and event arrays below.
// write() wraps back to slot 0 once mCount reaches this value.
#define NUM_SCHED 1

public:
  oclDct();
  ~oclDct();

  // Stores the externally created OpenCL objects and the batch size
  // (number of 64-element blocks per transaction), and resets the slot state.
  void init(cl_context   context, 
        cl_device_id device, 
        cl_kernel    krnl, 
        cl_command_queue q,
        size_t blocks);

  // Creates the OpenCL buffers of the current slot over the host vectors
  // for batch number `start`.
  void write(
         size_t start,
         std::vector<int16_t,aligned_allocator<int16_t>> *blocks,
         std::vector<uint16_t,aligned_allocator<uint16_t>> *q,
         std::vector<int16_t,aligned_allocator<int16_t>> *out,
         bool ignore_dc
         );
  // Sets the kernel arguments, then enqueues the input migration and the kernel.
  void run();
  // Enqueues the migration of the output buffer to the host and advances mCount.
  void read();
  // Waits for the queue to drain and releases buffers and events.
  void finish();
private:
  // OpenCL objects supplied through init(); not created by this class.
  cl_context        mContext;
  cl_device_id      mDevice;
  cl_kernel         mKernel;
  cl_command_queue  mQ;

  unsigned int      mNumBlocks64;  // 64-element blocks per transaction (kernel arg 4)
  bool              mInit;         // set to true by init()
  unsigned int      mCount;        // index of the current slot; incremented by read()
  bool              mHasRun;       // set by write() once the slots have wrapped around

  // Per-slot buffers: inputs [0] = block data, [1] = q table; one output.
  cl_mem            mInBufferVec[NUM_SCHED][2];
  cl_mem            mOutBufferVec[NUM_SCHED][1];

  // Point at the current slot's entries in the arrays above (set by write()).
  cl_mem            *mInBuffer;
  cl_mem            *mOutBuffer;
  int               m_dev_ignore_dc;   // ignore_dc as 0/1, passed as kernel arg 3
  
  // Per-slot events: input migration -> kernel run -> output migration.
  cl_event          inEvVec[NUM_SCHED];
  cl_event          runEvVec[NUM_SCHED];
  cl_event          outEvVec[NUM_SCHED];

};
197 | 
198 | 
199 | /* *************************************************************************** 
200 | 
201 | oclDct Constructor
202 | 
203 | *************************************************************************** */
204 | oclDct::oclDct() {
205 |   mInit = false;
206 |   mNumBlocks64 = 0;
207 | }
208 | 
209 | 
210 | /* *************************************************************************** 
211 | 
212 | oclDct Destructor
213 | 
214 | *************************************************************************** */
215 | oclDct::~oclDct() {
216 | }
217 | 
218 | 
219 | /* *************************************************************************** 
220 | 
221 | oclDct::init
222 | 
223 | OclDct object initialization. This sets the internal state of the
224 | kernel interaction class. All general openCL objects are expected to
225 | be allocated externally and provided to the kernel interaction class.
226 | 
227 | *************************************************************************** */
228 | void oclDct::init(cl_context   context, 
229 |           cl_device_id device, 
230 |           cl_kernel    krnl, 
231 |           cl_command_queue q,
232 |           size_t numBlocks64) 
233 | {
234 |   mContext = context;
235 |   mDevice  = device;
236 |   mKernel  = krnl;
237 |   mQ       = q;
238 |   
239 |   mNumBlocks64 = numBlocks64;
240 |   
241 |   assert(mNumBlocks64 == numBlocks64); // check that there was not a truncation
242 |   mInit = true;
243 |   mCount = 0;
244 |   mHasRun = false;
245 | 
246 |   mInit = true;
247 | }
248 | 
249 | 
250 | /* *************************************************************************** 
251 | 
252 | oclDct::write
253 | 
254 | This function manages the buffer allocation for the openCL kernel
255 | interaction before actually enqueuing the operands for kernel
256 | processing. Note all buffer and event management for a complete
257 | transaction is managed in this function.
258 | 
259 | *************************************************************************** */
260 | void oclDct::write(
261 |            size_t start,
262 |            std::vector<int16_t,aligned_allocator<int16_t>> *blocks,
263 |            std::vector<uint16_t,aligned_allocator<uint16_t>> *q,
264 |            std::vector<int16_t,aligned_allocator<int16_t>> *out,
265 |            bool ignore_dc
266 |            ) {
267 | 
268 |   if(mCount == NUM_SCHED) {
269 |     mHasRun = true;
270 |     mCount = 0;
271 |   }
272 | 
273 |   if(mHasRun) {
274 |     clWaitForEvents(1, &outEvVec[mCount]);
275 | 
276 |     clReleaseMemObject(mOutBufferVec[mCount][0]);
277 |     clReleaseMemObject(mInBufferVec[mCount][0]);
278 |     clReleaseMemObject(mInBufferVec[mCount][1]);
279 | 
280 |     clReleaseEvent(outEvVec[mCount]);
281 |     clReleaseEvent(inEvVec[mCount]);
282 |     clReleaseEvent(runEvVec[mCount]);
283 | 
284 |   }
285 | 
286 |   mInBuffer = &(mInBufferVec[mCount][0]);
287 |   mOutBuffer = &(mOutBufferVec[mCount][0]);
288 | 
289 |   cl_int err;
290 |   // Move Buffer over input vector
291 |   mInBuffer[0] = clCreateBuffer(mContext, 
292 |                 CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
293 |                 mNumBlocks64*64*sizeof(int16_t), 
294 |                 blocks->data() + mNumBlocks64*64*start,
295 |                 &err);
296 | 
297 |   mInBuffer[1] = clCreateBuffer(mContext, 
298 |                 CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY,
299 |                 64*sizeof(uint16_t), 
300 |                 q->data(),
301 |                 &err);
302 |   
303 |   // Move Buffer over output vector
304 |   mOutBuffer[0] =clCreateBuffer(mContext, 
305 |                 CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY,
306 |                 mNumBlocks64*64*sizeof(int16_t), 
307 |                 out->data() + mNumBlocks64*64*start,
308 |                 &err);
309 |   
310 |   // Prepare Kernel to run
311 |   m_dev_ignore_dc = ignore_dc ? 1 : 0;
312 |   
313 | }
314 | 
315 | 
316 | /* *************************************************************************** 
317 | 
318 | oclDct::run
319 | 
320 | This function sets the kernel arguments and enqueues the kernel
321 | execution.
322 | 
323 | *************************************************************************** */
324 | void oclDct::run() {
325 |   // Set the kernel arguments
326 |   clSetKernelArg(mKernel, 0, sizeof(cl_mem), &mInBuffer[0]);
327 |   clSetKernelArg(mKernel, 1, sizeof(cl_mem), &mInBuffer[1]);
328 |   clSetKernelArg(mKernel, 2, sizeof(cl_mem), &mOutBuffer[0]);
329 |   clSetKernelArg(mKernel, 3, sizeof(int), &m_dev_ignore_dc);
330 |   clSetKernelArg(mKernel, 4, sizeof(unsigned int), &mNumBlocks64);
331 | 
332 |   // Schedule actual writing of data
333 |   clEnqueueMigrateMemObjects(mQ, 2, mInBuffer, 0, 0, nullptr, &inEvVec[mCount]);
334 | 
335 |   clEnqueueTask(mQ, mKernel, 1, &inEvVec[mCount], &runEvVec[mCount]);
336 | }
337 | 
338 | 
339 | /* *************************************************************************** 
340 | 
341 | oclDct::read
342 | 
343 | This function enqueues the read back operation of the results of the idct.
344 | 
345 | *************************************************************************** */
346 | void oclDct::read() {
347 |   clEnqueueMigrateMemObjects(mQ, 1, mOutBuffer, CL_MIGRATE_MEM_OBJECT_HOST, 1, &runEvVec[mCount], &outEvVec[mCount]);
348 |   mCount++;
349 | }
350 | 
351 | 
352 | /* *************************************************************************** 
353 | 
354 | oclDct::finish
355 | 
356 | This function ensures kernel processing has completed for all
357 | transactions and it releases the allocated opencl objects.
358 | 
359 | *************************************************************************** */
360 | void oclDct::finish() {
361 |   clFinish(mQ); // block until every command enqueued on mQ has completed
362 |   // Slots [0, mCount) are live: run()/read() fill slot mCount, then read() increments it.
363 |   // The former 'mCount-1' leaked the last slot and wrapped around when mCount == 0.
364 |   // NOTE(review): mHasRun presumably means all NUM_SCHED slots were used at least once - confirm in write().
365 |   unsigned int delCount = mHasRun ? NUM_SCHED : mCount;
366 |   for(unsigned int i = 0; i< delCount; i++) {
367 |     clReleaseMemObject(mOutBufferVec[i][0]); // output buffer of slot i
368 |     clReleaseMemObject(mInBufferVec[i][0]);  // first input buffer of slot i
369 |     clReleaseMemObject(mInBufferVec[i][1]);  // second input buffer of slot i
370 | 
371 |     clReleaseEvent(inEvVec[i]);  // input-migration event
372 |     clReleaseEvent(runEvVec[i]); // kernel-execution event
373 |     clReleaseEvent(outEvVec[i]); // output-migration event
374 |   }
375 | }
376 | 
377 | 
378 | /* *************************************************************************** 
379 | 
380 | runFPGA
381 | 
382 | This function guides the kernel execution of the idct algorithm.
383 | 
384 | *************************************************************************** */
385 | void runFPGA(
386 |     size_t blocks,                                                    // total number of 64-coefficient blocks
387 |     std::vector<int16_t,aligned_allocator<int16_t>> &source_block,    // input coefficients
388 |     std::vector<uint16_t,aligned_allocator<uint16_t>> &source_q,      // 64-entry quantisation table
389 |     std::vector<int16_t,aligned_allocator<int16_t>> &result_vpout,    // receives the FPGA results
390 |     cl_command_queue q,                                               // not used inside this function
391 |     bool ignore_dc,
392 |     oclDct &cu,                                                       // compute unit driving the kernel
393 |     unsigned int numBlocks64                                          // blocks handled per transfer
394 | ) {
395 |   const size_t transfers = blocks/numBlocks64; // integer division: one transfer per numBlocks64 blocks
396 |   for(size_t batch = 0; batch != transfers; ++batch) {
397 |     cu.write(batch, &source_block, &source_q, &result_vpout, ignore_dc);
398 |     cu.run();
399 |     cu.read();
400 |   }
401 |   cu.finish(); // wait for all enqueued work and release the OpenCL objects
402 | }
403 | 
404 | 
405 | 
406 | /* *************************************************************************** 
407 | 
408 | runCPU
409 | 
410 | This function performs the host code computation of the idct
411 | algorithm.
412 | 
413 | *************************************************************************** */
414 | void runCPU(size_t blocks,
415 |             std::vector<int16_t,aligned_allocator<int16_t>> &source_block,
416 |             std::vector<uint16_t,aligned_allocator<uint16_t>> &source_q,
417 |             std::vector<int16_t,aligned_allocator<int16_t>> &golden_vpout,
418 |             bool ignore_dc) {
419 |   // Every block is 64 coefficients wide; all blocks share the table that starts at source_q[0].
420 |   for(size_t blk = 0; blk != blocks; ++blk) {
421 |     const size_t offset = blk*64;
422 |     idctSoft(source_block.data() + offset, source_q.data(), golden_vpout.data() + offset, ignore_dc);
423 |   }
424 | }
425 | 
426 | 
427 | 
428 | /* *************************************************************************** 
429 | 
430 | main
431 | 
432 | This function is the main function of the idct program. It illustrates
433 | the basic opencl hostcode setup, followed by the idct execution on
434 | host (CPU) and an accelerated flow (FPGA), and ends with a functional
435 | comparison between host and FPGA execution.
436 | 
437 | *************************************************************************** */
438 | int main(int argc, char* argv[]) {
439 |   // Returns EXIT_SUCCESS when the FPGA output matches the CPU reference; EXIT_FAILURE or an OpenCL error code otherwise.
440 |   char *xcl_mode = getenv("XCL_EMULATION_MODE"); // non-NULL selects the reduced emulation workload below
441 | 
442 |   if (argc != 2) {
443 |     printf("Usage: %s <XCLBIN File>\n", argv[0]);
444 |     return EXIT_FAILURE;
445 |   }
446 | 
447 |   char* binaryName = argv[1];
448 | 
449 | 
450 |   // *********** Allocate and initialize test vectors **********
451 | 
452 |   // Blocks of 64 of int16_t
453 |   size_t blocks = 1024*1024*4;
454 | 
455 |   // Limit blocks for emulation modes
456 |   if (xcl_mode != NULL) {
457 |     blocks = 1024;
458 |   }
459 | 
460 |   bool ignore_dc = true;
461 |   
462 |   // Create input
463 |   std::vector<int16_t, aligned_allocator<int16_t>>  source_block(64*blocks);
464 |   std::vector<uint16_t, aligned_allocator<uint16_t>> source_q(64);
465 |   std::vector<int16_t, aligned_allocator<int16_t>>  golden_vpout(64*blocks);
466 |   std::vector<int16_t, aligned_allocator<int16_t>>  result_vpout(64*blocks);
467 | 
468 |   for(size_t i = 0; i < blocks; i++){
469 |     for(size_t j = 0; j < 64; j++) {
470 |       source_block[i*64 + j] = j;
471 |     }
472 |   }
473 |     
474 |   for(size_t j = 0; j < 64; j++) {
475 |     source_q[j] = j;
476 |   }
477 | 
478 | 
479 |   // *********** Communication Parameters **********
480 |   int banks = 1;
481 |   const size_t cus = banks;
482 |   const size_t threads = cus;
483 |   size_t numBlocks64 = 512; 
484 | 
485 |   if (xcl_mode != NULL) {
486 |     numBlocks64 = 256;
487 |   }
488 | 
489 |   std::cout << "FPGA number of 64*int16_t blocks per transfer: " << numBlocks64 << std::endl;
490 |   if(blocks%(threads*numBlocks64) != 0) {
491 |     std::cout << "Error: The current implementation supports only full banks to be transfered"
492 |           << " per thread" << std::endl;
493 |     exit(1);
494 |   }
495 | 
496 |   // *********** OpenCL Host Code Setup **********
497 | 
498 |   // Connect to first platform
499 |   int err = CL_SUCCESS;
500 |   char cl_platform_vendor[1001];
501 |   char cl_platform_name[1001];
502 |   char cl_device_name[1001];
503 | 
504 |   cl_platform_id platform_id;         // platform id
505 |   cl_device_id device_id;             // compute device id
506 |   cl_context context;                 // compute context
507 | 
508 |   // Get number of platforms (stays 0 if the query fails, which leads to "Platform Not Found")
509 |   cl_uint platform_count = 0;
510 |   clGetPlatformIDs(0, nullptr, &platform_count);
511 | 
512 |   // get all platforms
513 |   std::vector<cl_platform_id> platforms(platform_count);
514 |   clGetPlatformIDs(platform_count, platforms.data(), nullptr);
515 | 
516 |   bool found = false;
517 |   for (int p = 0; p < (int)platform_count; ++p) {  
518 |     platform_id = platforms[p];
519 |     clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL);
520 |     clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL);
521 |     if(!strcmp(cl_platform_vendor,"Xilinx")) {
522 |       found = true;
523 |       break;
524 |     }
525 |   }
526 |   if (!found){
527 |     std::cout << "Platform Not Found\n";
528 |     return EXIT_FAILURE; // previously 'return err', but err had not been assigned at this point
529 |   }
530 | 
531 |   err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ACCELERATOR, 1, &device_id, NULL);
532 |   if (err != CL_SUCCESS) {
533 |     std::cout << "FAILED TEST - Device\n";
534 |     return err;
535 |   }
536 |   
537 |   context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
538 |   if (!context || (err != CL_SUCCESS)) {
539 |     std::cout << "FAILED TEST - Context \n";
540 |     return err;
541 |   }
542 |   
543 |   clGetDeviceInfo(device_id, CL_DEVICE_NAME, 1000, (void*)cl_device_name, NULL);
544 | 
545 |   std::cout << "DEVICE: " << cl_device_name << std::endl;
546 | 
547 |   std::cout << "Loading Bitstream: " << binaryName << std::endl; 
548 |   char *krnl_bin;
549 |   size_t krnl_size;
550 |   krnl_size = load_file_to_memory(binaryName, &krnl_bin);
551 | 
552 |   printf("INFO: Loaded file\n");
553 | 
554 |   cl_program program = clCreateProgramWithBinary(context, 1,
555 |                          (const cl_device_id* ) &device_id, &krnl_size,
556 |                          (const unsigned char**) &krnl_bin,
557 |                          NULL, &err);
558 |   // Stop here instead of handing an invalid program to clCreateKernel
559 |   if (!program || (err != CL_SUCCESS)) { std::cout << "FAILED TEST - Program\n"; return EXIT_FAILURE; }
560 |   // Create Kernel
561 |   std::cout << "Create Kernel: krnl_idct" << std::endl;
562 |   cl_kernel krnl = clCreateKernel(program, "krnl_idct", &err);
563 |   if (!krnl || (err != CL_SUCCESS)) { std::cout << "FAILED TEST - Kernel\n"; return EXIT_FAILURE; }
564 |   // Create Command Queue
565 |   cl_command_queue q = clCreateCommandQueue(context, device_id, 
566 |                         CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
567 |   if (!q || (err != CL_SUCCESS)) { std::cout << "FAILED TEST - Command Queue\n"; return EXIT_FAILURE; }
568 |   // Create compute units
569 |   std::cout << "Create Compute Unit" << std::endl;
570 |   oclDct cu;
571 |   cu.init(context, device_id, krnl, q, numBlocks64);
572 | 
573 |   std::cout << "Setup complete" << std::endl;
574 | 
575 | 
576 |   // *********** Host (CPU) execution **********
577 |   std::cout << "Running CPU version" << std::endl;
578 |   auto cpu_begin = std::chrono::high_resolution_clock::now();
579 |   runCPU(blocks, source_block, source_q, golden_vpout, ignore_dc);
580 |   auto cpu_end = std::chrono::high_resolution_clock::now();
581 |   
582 | 
583 |   // *********** Accelerator execution **********
584 |   std::cout << "Running FPGA version" << std::endl;
585 |   auto fpga_begin = std::chrono::high_resolution_clock::now();
586 |   runFPGA(blocks, 
587 |       source_block, 
588 |       source_q, 
589 |       result_vpout, 
590 |       q,
591 |       ignore_dc, 
592 |       cu, 
593 |       numBlocks64);
594 |   auto fpga_end = std::chrono::high_resolution_clock::now();
595 | 
596 | 
597 |   // *********** OpenCL Host Code cleanup **********
598 | 
599 |   clReleaseCommandQueue(q);
600 |   clReleaseKernel(krnl);
601 |   clReleaseProgram(program);
602 |   clReleaseContext(context);
603 | 
604 | 
605 |   // *********** Comparison (Host to Acceleration)  **********
606 | 
607 |   std::cout << "Runs complete validating results" << std::endl;
608 |   // krnl_match is 0 while every element matches and 1 after the first mismatch
609 |   int krnl_match = 0;
610 |   for(size_t i = 0; i < 64*blocks; i++){
611 |     if(result_vpout[i] != golden_vpout[i]){
612 |       printf("Error: Result mismatch\n");
613 |       printf("i = %d CPU result = %d Krnl Result = %d\n", 
614 |          (int) i, golden_vpout[i], result_vpout[i]);
615 |       krnl_match = 1;
616 |       break;
617 |     } 
618 |   }
619 | 
620 |   std::cout << "TEST " << (krnl_match ? "FAILED" : "PASSED") << std::endl;
621 | 
622 |   // *********** Computational Statistics  **********
623 |   //
624 |   // Only reported in the HW execution mode as wall clock time is meaningless in
625 |   // emulation.
626 |   //
627 |   if (xcl_mode == NULL) {
628 |     std::chrono::duration<double> cpu_duration = cpu_end - cpu_begin;
629 |     std::chrono::duration<double> fpga_duration = fpga_end - fpga_begin;
630 |     // blocks*128 = bytes per direction (64 int16_t values of 2 bytes per block)
631 |     std::cout << "CPU Time:        " << cpu_duration.count() << " s" << std::endl;
632 |     std::cout << "CPU Throughput:  " 
633 |           << (double) blocks*128 / cpu_duration.count() / (1024.0*1024.0)
634 |           << " MB/s" << std::endl;
635 |     std::cout << "FPGA Time:       " << fpga_duration.count() << " s" << std::endl;
636 |     std::cout << "FPGA Throughput: " 
637 |           << (double) blocks*128 / fpga_duration.count() / (1024.0*1024.0)
638 |           << " MB/s" << std::endl;
639 |     std::cout << "FPGA PCIe Throughput: " 
640 |           << (2*(double) blocks*128 + 128) / fpga_duration.count() / (1024.0*1024.0)
641 |           << " MB/s" << std::endl;
642 |   } else {
643 |     std::cout << "RUN COMPLETE" << std::endl;
644 |   }
645 | 
646 |   return (krnl_match ? EXIT_FAILURE :  EXIT_SUCCESS);
647 | }
648 | 
649 | 
650 | 
651 | /* *************************************************************************** 
652 | 
653 | idctSoft
654 | 
655 | Original software implementation of IDCT algorithm used to generate
656 | golden reference data.
657 | 
658 | *************************************************************************** */
659 | void idctSoft(const int16_t block[64],   // 8x8 input coefficients, row-major (index = 8*row + column)
660 |           const uint16_t q[64],          // per-coefficient multipliers applied to block[]
661 |           int16_t outp[64],              // 8x8 result, row-major
662 |           bool ignore_dc) {              // when true, the coefficient at index 0 is treated as 0
663 |   int32_t intermed[64]; // output of the horizontal pass, input of the vertical pass
664 | 
665 |   const uint16_t w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
666 |   const uint16_t w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
667 |   const uint16_t w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
668 |   const uint16_t w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
669 |   const uint16_t w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
670 |   const uint16_t w7 = 565;  // 2048*sqrt(2)*cos(7*pi/16)
671 |   // Pre-computed sums and differences of the weights used by the butterflies below
672 |   const uint16_t w1pw7 = w1 + w7;
673 |   const uint16_t w1mw7 = w1 - w7;
674 |   const uint16_t w2pw6 = w2 + w6;
675 |   const uint16_t w2mw6 = w2 - w6;
676 |   const uint16_t w3pw5 = w3 + w5;
677 |   const uint16_t w3mw5 = w3 - w5;
678 | 
679 |   const uint16_t r2 = 181; // 256/sqrt(2)
680 | 
681 |   // Horizontal 1-D IDCT.
682 |   for (int y = 0; y < 8; ++y) {
683 |     int y8 = y * 8;
684 |     int32_t x0 = (((ignore_dc && y == 0)
685 |            ? 0 : (block[y8 + 0] * q[y8 + 0]) << 11)) + 128;
686 |     int32_t x1 = (block[y8 + 4] * q[y8 + 4]) << 11;
687 |     int32_t x2 = block[y8 + 6] * q[y8 + 6];
688 |     int32_t x3 = block[y8 + 2] * q[y8 + 2];
689 |     int32_t x4 = block[y8 + 1] * q[y8 + 1];
690 |     int32_t x5 = block[y8 + 7] * q[y8 + 7];
691 |     int32_t x6 = block[y8 + 5] * q[y8 + 5];
692 |     int32_t x7 = block[y8 + 3] * q[y8 + 3];
693 |     // If all the AC components are zero, then the IDCT is trivial.
694 |     if (x1 ==0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0 && x6 == 0 && x7 == 0) {
695 |       int32_t dc = (x0 - 128) >> 8; // coefficients[0] << 3
696 |       intermed[y8 + 0] = dc;
697 |       intermed[y8 + 1] = dc;
698 |       intermed[y8 + 2] = dc;
699 |       intermed[y8 + 3] = dc;
700 |       intermed[y8 + 4] = dc;
701 |       intermed[y8 + 5] = dc;
702 |       intermed[y8 + 6] = dc;
703 |       intermed[y8 + 7] = dc;
704 |       continue;
705 |     }
706 |         
707 |     // Prescale. (No statements here: x0 and x1 were already shifted left by 11 above.)
708 |         
709 |     // Stage 1.
710 |     int32_t x8 = w7 * (x4 + x5);
711 |     x4 = x8 + w1mw7*x4;
712 |     x5 = x8 - w1pw7*x5;
713 |     x8 = w3 * (x6 + x7);
714 |     x6 = x8 - w3mw5*x6;
715 |     x7 = x8 - w3pw5*x7;
716 |         
717 |     // Stage 2.
718 |     x8 = x0 + x1;
719 |     x0 -= x1;
720 |     x1 = w6 * (x3 + x2);
721 |     x2 = x1 - w2pw6*x2;
722 |     x3 = x1 + w2mw6*x3;
723 |     x1 = x4 + x6;
724 |     x4 -= x6;
725 |     x6 = x5 + x7;
726 |     x5 -= x7;
727 |         
728 |     // Stage 3.
729 |     x7 = x8 + x3;
730 |     x8 -= x3;
731 |     x3 = x0 + x2;
732 |     x0 -= x2;
733 |     x2 = (r2*(x4+x5) + 128) >> 8;
734 |     x4 = (r2*(x4-x5) + 128) >> 8;
735 |         
736 |     // Stage 4.
737 |     intermed[y8+0] = (x7 + x1) >> 8;
738 |     intermed[y8+1] = (x3 + x2) >> 8;
739 |     intermed[y8+2] = (x0 + x4) >> 8;
740 |     intermed[y8+3] = (x8 + x6) >> 8;
741 |     intermed[y8+4] = (x8 - x6) >> 8;
742 |     intermed[y8+5] = (x0 - x4) >> 8;
743 |     intermed[y8+6] = (x3 - x2) >> 8;
744 |     intermed[y8+7] = (x7 - x1) >> 8;
745 |   }
746 |     
747 |   // Vertical 1-D IDCT.
748 |   for (int32_t x = 0; x < 8; ++x) {
749 |     // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
750 |     // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
751 |     // we do not bother to check for the all-zero case.
752 |         
753 |     // Prescale.
754 |     int32_t y0 = (intermed[8*0+x] << 8) + 8192;
755 |     int32_t y1 = intermed[8*4+x] << 8;
756 |     int32_t y2 = intermed[8*6+x];
757 |     int32_t y3 = intermed[8*2+x];
758 |     int32_t y4 = intermed[8*1+x];
759 |     int32_t y5 = intermed[8*7+x];
760 |     int32_t y6 = intermed[8*5+x];
761 |     int32_t y7 = intermed[8*3+x];
762 |         
763 |     // Stage 1.
764 |     int32_t y8 = w7*(y4+y5) + 4;
765 |     y4 = (y8 + w1mw7*y4) >> 3;
766 |     y5 = (y8 - w1pw7*y5) >> 3;
767 |     y8 = w3*(y6+y7) + 4;
768 |     y6 = (y8 - w3mw5*y6) >> 3;
769 |     y7 = (y8 - w3pw5*y7) >> 3;
770 |         
771 |     // Stage 2.
772 |     y8 = y0 + y1;
773 |     y0 -= y1;
774 |     y1 = w6*(y3+y2) + 4;
775 |     y2 = (y1 - w2pw6*y2) >> 3;
776 |     y3 = (y1 + w2mw6*y3) >> 3;
777 |     y1 = y4 + y6;
778 |     y4 -= y6;
779 |     y6 = y5 + y7;
780 |     y5 -= y7;
781 |         
782 |     // Stage 3.
783 |     y7 = y8 + y3;
784 |     y8 -= y3;
785 |     y3 = y0 + y2;
786 |     y0 -= y2;
787 |     y2 = (r2*(y4+y5) + 128) >> 8;
788 |     y4 = (r2*(y4-y5) + 128) >> 8;
789 |         
790 |     // Stage 4. (results are narrowed to int16_t on assignment)
791 |     outp[8*0+x] = (y7 + y1) >> 11;
792 |     outp[8*1+x] = (y3 + y2) >> 11;
793 |     outp[8*2+x] = (y0 + y4) >> 11;
794 |     outp[8*3+x] = (y8 + y6) >> 11;
795 |     outp[8*4+x] = (y8 - y6) >> 11;
796 |     outp[8*5+x] = (y0 - y4) >> 11;
797 |     outp[8*6+x] = (y3 - y2) >> 11;
798 |     outp[8*7+x] = (y7 - y1) >> 11;
799 |   }
800 | }
801 | 


--------------------------------------------------------------------------------
/sources/optimization_lab/krnl_idct.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2018, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | 
 30 | #include <string.h>
 31 | #include <stdio.h>
 32 | #include <ap_int.h>
 33 | #include <hls_stream.h>
 34 | 
 35 | typedef short int16_t;            // kernel-local alias; assumes short is 16 bits on this toolchain - TODO confirm
 36 | typedef unsigned short uint16_t;  // kernel-local alias; assumes unsigned short is 16 bits - TODO confirm
 37 | typedef int int32_t;              // kernel-local alias; assumes int is 32 bits - TODO confirm
 38 | 
 39 | /* *************************************************************************** 
 40 | 
 41 | reg
 42 | 
 43 | Simple pass-through function that is kept from being inlined during
 44 | synthesis, which forces the insertion of registers.
 45 | 
 46 | *************************************************************************** */
 47 | template <typename reg_t>
 48 | reg_t reg(reg_t x) { // identity function: returns its argument unchanged
 49 |   #pragma HLS INLINE off
 50 |   return x; // the value is unchanged; the function exists only for the INLINE off pragma above
 51 | }
 52 | 
 53 | 
 54 | 
 55 | /* *************************************************************************** 
 56 | 
 57 | idct
 58 | 
 59 | Idct algorithm description used to describe the actual synthesizable
 60 | idct behavior. 
 61 | 
 62 | *************************************************************************** */
 63 | void idct(const int16_t block[64],   // 8x8 input coefficients, row-major (index = 8*row + column)
 64 |       const uint16_t q[64],          // per-coefficient multipliers applied to block[]
 65 |       int16_t outp[64],              // 8x8 result, row-major
 66 |       bool ignore_dc) {              // when true, the coefficient at index 0 is treated as 0
 67 |   #pragma HLS INLINE
 68 | 
 69 |   int32_t intermed[64]; // output of the horizontal pass, input of the vertical pass
 70 | 
 71 |   const uint16_t w1 = 2841; // 2048*sqrt(2)*cos(1*pi/16)
 72 |   const uint16_t w2 = 2676; // 2048*sqrt(2)*cos(2*pi/16)
 73 |   const uint16_t w3 = 2408; // 2048*sqrt(2)*cos(3*pi/16)
 74 |   const uint16_t w5 = 1609; // 2048*sqrt(2)*cos(5*pi/16)
 75 |   const uint16_t w6 = 1108; // 2048*sqrt(2)*cos(6*pi/16)
 76 |   const uint16_t w7 = 565;  // 2048*sqrt(2)*cos(7*pi/16)
 77 |   // Pre-computed sums and differences of the weights used by the butterflies below
 78 |   const uint16_t w1pw7 = w1 + w7;
 79 |   const uint16_t w1mw7 = w1 - w7;
 80 |   const uint16_t w2pw6 = w2 + w6;
 81 |   const uint16_t w2mw6 = w2 - w6;
 82 |   const uint16_t w3pw5 = w3 + w5;
 83 |   const uint16_t w3mw5 = w3 - w5;
 84 |   
 85 |   const uint16_t r2 = 181; // 256/sqrt(2)
 86 |   
 87 |   // Horizontal 1-D IDCT.
 88 |   for (int y = 0; y < 8; ++y) {
 89 |     int y8 = y * 8;
 90 |     int32_t x0 = (((ignore_dc && y == 0)
 91 |            ? 0 : (block[y8 + 0] * q[y8 + 0]) << 11)) + 128;
 92 |     int32_t x1 = (block[y8 + 4] * q[y8 + 4]) << 11;
 93 |     int32_t x2 = block[y8 + 6] * q[y8 + 6];
 94 |     int32_t x3 = block[y8 + 2] * q[y8 + 2];
 95 |     int32_t x4 = block[y8 + 1] * q[y8 + 1];
 96 |     int32_t x5 = block[y8 + 7] * q[y8 + 7];
 97 |     int32_t x6 = block[y8 + 5] * q[y8 + 5];
 98 |     int32_t x7 = block[y8 + 3] * q[y8 + 3];
 99 |     // If all the AC components are zero, then the IDCT is trivial.
100 |     if (x1 ==0 && x2 == 0 && x3 == 0 && x4 == 0 && x5 == 0 && x6 == 0 && x7 == 0) {
101 |       int32_t dc = (x0 - 128) >> 8; // coefficients[0] << 3
102 |       intermed[y8 + 0] = dc;
103 |       intermed[y8 + 1] = dc;
104 |       intermed[y8 + 2] = dc;
105 |       intermed[y8 + 3] = dc;
106 |       intermed[y8 + 4] = dc;
107 |       intermed[y8 + 5] = dc;
108 |       intermed[y8 + 6] = dc;
109 |       intermed[y8 + 7] = dc;
110 |       continue;
111 |     }
112 | 
113 |     // Prescale. (No statements here: x0 and x1 were already shifted left by 11 above.)
114 | 
115 |     // Stage 1.
116 |     int32_t x8 = w7 * (x4 + x5);
117 |     x4 = x8 + w1mw7*x4;
118 |     x5 = x8 - w1pw7*x5;
119 |     x8 = w3 * (x6 + x7);
120 |     x6 = x8 - w3mw5*x6;
121 |     x7 = x8 - w3pw5*x7;
122 | 
123 |     // Stage 2.
124 |     x8 = x0 + x1;
125 |     x0 -= x1;
126 |     x1 = w6 * (x3 + x2);
127 |     x2 = x1 - w2pw6*x2;
128 |     x3 = x1 + w2mw6*x3;
129 |     x1 = x4 + x6;
130 |     x4 -= x6;
131 |     x6 = x5 + x7;
132 |     x5 -= x7;
133 | 
134 |     // Stage 3.
135 |     x7 = x8 + x3;
136 |     x8 -= x3;
137 |     x3 = x0 + x2;
138 |     x0 -= x2;
139 |     x2 = (r2*(x4+x5) + 128) >> 8;
140 |     x4 = (r2*(x4-x5) + 128) >> 8;
141 | 
142 |     // Stage 4.
143 |     intermed[y8+0] = (x7 + x1) >> 8;
144 |     intermed[y8+1] = (x3 + x2) >> 8;
145 |     intermed[y8+2] = (x0 + x4) >> 8;
146 |     intermed[y8+3] = (x8 + x6) >> 8;
147 |     intermed[y8+4] = (x8 - x6) >> 8;
148 |     intermed[y8+5] = (x0 - x4) >> 8;
149 |     intermed[y8+6] = (x3 - x2) >> 8;
150 |     intermed[y8+7] = (x7 - x1) >> 8;
151 |   }
152 | 
153 |   // Vertical 1-D IDCT.
154 |   for (int32_t x = 0; x < 8; ++x) {
155 |     // Similar to the horizontal 1-D IDCT case, if all the AC components are zero, then the IDCT is trivial.
156 |     // However, after performing the horizontal 1-D IDCT, there are typically non-zero AC components, so
157 |     // we do not bother to check for the all-zero case.
158 | 
159 |     // Prescale.
160 |     int32_t y0 = (intermed[8*0+x] << 8) + 8192;
161 |     int32_t y1 = intermed[8*4+x] << 8;
162 |     int32_t y2 = intermed[8*6+x];
163 |     int32_t y3 = intermed[8*2+x];
164 |     int32_t y4 = intermed[8*1+x];
165 |     int32_t y5 = intermed[8*7+x];
166 |     int32_t y6 = intermed[8*5+x];
167 |     int32_t y7 = intermed[8*3+x];
168 |     // In this pass every product, and the sum feeding it, is passed through reg<>() (a non-inlined identity).
169 |     // Stage 1.
170 |     int32_t y8 = reg<int32_t>(w7*reg<int32_t>(y4+y5)) + 4;
171 |     y4 = (y8 + reg<int32_t>(w1mw7*y4)) >> 3;
172 |     y5 = (y8 - reg<int32_t>(w1pw7*y5)) >> 3;
173 |     y8 = reg<int32_t>(w3*reg<int32_t>(y6+y7)) + 4;
174 |     y6 = (y8 - reg<int32_t>(w3mw5*y6)) >> 3;
175 |     y7 = (y8 - reg<int32_t>(w3pw5*y7)) >> 3;
176 | 
177 |     // Stage 2.
178 |     y8 = y0 + y1;
179 |     y0 -= y1;
180 |     y1 = reg<int32_t>(w6*reg<int32_t>(y3+y2)) + 4;
181 |     y2 = (y1 - reg<int32_t>(w2pw6*y2)) >> 3;
182 |     y3 = (y1 + reg<int32_t>(w2mw6*y3)) >> 3;
183 |     y1 = y4 + y6;
184 |     y4 -= y6;
185 |     y6 = y5 + y7;
186 |     y5 -= y7;
187 | 
188 |     // Stage 3.
189 |     y7 = y8 + y3;
190 |     y8 -= y3;
191 |     y3 = y0 + y2;
192 |     y0 -= y2;
193 |     y2 = (reg<int32_t>(r2*reg<int32_t>(y4+y5)) + 128) >> 8;
194 |     y4 = (reg<int32_t>(r2*reg<int32_t>(y4-y5)) + 128) >> 8;
195 | 
196 |     // Stage 4. (results are narrowed to int16_t on assignment)
197 |     outp[8*0+x] = (y7 + y1) >> 11;
198 |     outp[8*1+x] = (y3 + y2) >> 11;
199 |     outp[8*2+x] = (y0 + y4) >> 11;
200 |     outp[8*3+x] = (y8 + y6) >> 11;
201 |     outp[8*4+x] = (y8 - y6) >> 11;
202 |     outp[8*5+x] = (y0 - y4) >> 11;
203 |     outp[8*6+x] = (y3 - y2) >> 11;
204 |     outp[8*7+x] = (y7 - y1) >> 11;
205 |   }
206 | }
207 | 
208 | typedef ap_uint<512> uint512_t; // 512-bit unsigned word; execute() unpacks it into 32 16-bit values
209 | typedef ap_int<512> int512_t;   // 512-bit signed word; execute() packs/unpacks 32 16-bit values
210 | 
211 | 
212 | 
213 | /* *************************************************************************** 
214 | 
215 | read_blocks
216 | 
217 | Dataflow block used to interface from input memory to streaming input
218 | channels.
219 | 
220 | *************************************************************************** */
221 | template<typename out_t>
222 | void read_blocks(const out_t *in, hls::stream<out_t> &out, unsigned int blocks) { // copies blocks*2 words from in[] to the stream
223 |     for(unsigned int i = 0; i < blocks*2; i++) { // two words per block (execute() reads 64/32 = 2 words per block)
224 |     #pragma HLS loop_tripcount min=2048 max=2048
225 |     #pragma HLS PIPELINE II=1
226 |         out.write(in[i]); // forward word i unchanged
227 |   }
228 | }
229 | 
230 | 
231 | 
232 | /* *************************************************************************** 
233 | 
234 | execute
235 | 
236 | Dataflow block used to manage full block computation. It uses wide
237 | arrays for single block computation to allow efficient access with
238 | ii=2 for the 8x8 data elements. 
239 | 
240 | *************************************************************************** */
241 | void execute(hls::stream<int512_t> &iblock,   // input coefficient words, two per block
242 |          hls::stream<uint512_t> &iq,          // q-table words, read only during the first iteration
243 |          hls::stream<int512_t> &ivoutp,       // output words, two per block
244 |          bool ignore_dc,                      // forwarded unchanged to idct()
245 |          unsigned int blocks) {               // number of 64-element blocks to process
246 |   for(unsigned int i = 0; i < blocks; i++) {
247 |     /* Use II=2 here as this will equalize all the dataflow processes and
248 |      * save resources */
249 |   #pragma HLS loop_tripcount min=1024 max=1024
250 |   #pragma HLS PIPELINE II=2
251 |     // NOTE(review): iiq is declared inside the loop but only filled when i==0; later iterations rely on it keeping its contents - confirm.
252 |     int16_t  iiblock[64];
253 |     uint16_t iiq[64];
254 |     int16_t  iivoutp[64];
255 |     // First iteration only: read two words from iq and unpack 32 16-bit values from each into iiq
256 |     for(short j = 0; j < 64/32; j++) {
257 |       if(i==0) {
258 |     ap_uint<512> tmp;
259 |     tmp = iq.read();
260 |     for(short k = 0; k < 32; k++) {
261 |       iiq[j*32+k] = tmp(16*(k+1)-1, 16*k);
262 |     }
263 |       }
264 |     }
265 |     // Read two words from iblock and unpack them into the 64 coefficients of this block
266 |     for(short j = 0; j < 64/32; j++) {
267 |       ap_int<512> tmp;
268 |       tmp = iblock.read();
269 |       for(short k = 0; k < 32; k++) {
270 |     iiblock[j*32+k] = tmp(16*(k+1)-1, 16*k);
271 |       }
272 |     }
273 |     // Transform the unpacked block
274 |     idct(iiblock, iiq, iivoutp, ignore_dc);
275 |     // Pack the 64 results into two words (element k in bits [16*k+15 : 16*k]) and write them out
276 |     for(short j = 0; j < 64/32; j++) {
277 |       ap_int<512> tmp;
278 |       for(short k = 0; k < 32; k++) {
279 |     tmp(16*(k+1)-1, 16*k) = iivoutp[j*32+k];
280 |       }
281 |       ivoutp.write(tmp);
282 |     }
283 |   }
284 | }
285 | 
286 | 
287 | 
288 | /* *************************************************************************** 
289 | 
290 | write_blocks
291 | 
292 | Dataflow block used to interface from streaming output channel to
293 | output memory.
294 | 
295 | *************************************************************************** */
296 | void write_blocks(ap_int<512> *out, hls::stream<int512_t> &in, unsigned int blocks) { // copies blocks*2 words from the stream to out[]
297 |   for(unsigned int i = 0; i < blocks*2; i++) { // two words per block, matching what execute() writes
298 |   #pragma HLS loop_tripcount min=2048 max=2048
299 |   #pragma HLS PIPELINE II=1
300 |     out[i] = in.read(); // store word i unchanged
301 |   }
302 | }
303 | 
304 | 
305 | 
306 | /* *************************************************************************** 
307 | 
308 | krnl_idct_dataflow
309 | 
310 | Top idct kernel function, used to clearly isolate and identify
311 | dataflow blocks.
312 | 
313 | *************************************************************************** */
314 | void krnl_idct_dataflow(const ap_int<512> *block,   // packed input coefficients, two words per block
315 |             const ap_uint<512> *q,                  // packed q table; two words are read
316 |             ap_int<512> *voutp,                     // packed results, two words per block
317 |             int ignore_dc,                          // non-zero is passed to execute() as true
318 |             unsigned int blocks) {                  // number of 64-element blocks
319 |   //#pragma HLS DATAFLOW
320 |   // NOTE(review): the DATAFLOW pragma above is commented out and therefore inactive - presumably left for the lab exercise; confirm.
321 |   hls::stream<int512_t> iblock("input_stream1");
322 |   hls::stream<uint512_t> iq("input_stream2");
323 |   hls::stream<int512_t> ivoutp("output_stream");
324 |   #pragma  HLS stream variable=iblock depth=512
325 |   #pragma  HLS stream variable=iq     depth=2
326 |   #pragma  HLS stream variable=ivoutp depth=512
327 |   // Stage order: load the q table (1 block = 2 words), load the coefficients,
328 |   // run the transform, then store the results.
329 |   read_blocks<uint512_t>(q, iq, 1);
330 |   read_blocks<int512_t>(block, iblock, blocks);
331 |   execute(iblock, iq, ivoutp, ignore_dc ? true : false, blocks);
332 |   write_blocks(voutp, ivoutp, blocks);
333 | }
334 | 
335 | 
336 | 
337 | /* *************************************************************************** 
338 | 
339 | krnl_idct
340 | 
341 | Kernel idct interface definition. 
342 | 
343 | *************************************************************************** */
344 | extern "C" { // C linkage keeps the symbol name "krnl_idct" unmangled
345 | void krnl_idct(const ap_int<512> *block,   // input coefficients (m_axi bundle gmem0)
346 |            const ap_uint<512> *q,          // q table (m_axi bundle gmem1)
347 |            ap_int<512> *voutp,             // results (m_axi bundle gmem2)
348 |            int ignore_dc,                  // scalar passed through the control bundle
349 |            unsigned int blocks) {          // scalar passed through the control bundle
350 |   #pragma HLS INTERFACE m_axi     port=block     offset=slave bundle=gmem0
351 |   #pragma HLS INTERFACE s_axilite port=block                  bundle=control
352 |   #pragma HLS INTERFACE m_axi     port=q         offset=slave bundle=gmem1
353 |   #pragma HLS INTERFACE s_axilite port=q                      bundle=control
354 |   #pragma HLS INTERFACE m_axi     port=voutp     offset=slave bundle=gmem2
355 |   #pragma HLS INTERFACE s_axilite port=voutp                  bundle=control
356 |   #pragma HLS INTERFACE s_axilite port=ignore_dc              bundle=control
357 |   #pragma HLS INTERFACE s_axilite port=blocks                 bundle=control
358 |   #pragma HLS INTERFACE s_axilite port=return                 bundle=control
359 |   // All work is delegated to krnl_idct_dataflow; this wrapper only declares the interfaces.
360 |   krnl_idct_dataflow(block, q, voutp, ignore_dc, blocks);
361 | }
362 | 
363 | }
364 | 


--------------------------------------------------------------------------------