RISC-V MCU中文社区

【分享】 HLS 全连接层加速器开发

发表于 全国大学生集成电路创新创业大赛 2021-07-24 16:15:15
0
2207
0

一、队伍介绍

队伍名称:Micro_423 队伍编号:CICC1195 大家好,本篇是我们队伍的第五篇分享,主要介绍一下HLS 全连接层加速器开发。

二、具体方法

.c/.h 代码如下。主体功能是接收一个 1*100 的输入向量,依次经过 3 层全连接计算:前两层为全连接加 SELU 激活,最后一层为全连接加偏置,层级计算关系为 1*100->1*20->1*20->1*1;并且通过 #pragma 把各个接口配置成 AXI4-Lite(s_axilite)模式。实现非常简单,具体细节看代码即可。

#include#include "hls_linear_algebra.h"#include "hls_math.h"#include "hunyihun.h"//mat_a_t a[MAT_A_ROWS][MAT_A_COLS],b[MAT_B_ROWS][MAT_B_COLS];//result_t res[MAT_A_ROWS][MAT_B_COLS];//#include //#define DELAY 50000000//#pragma HLS INTERFACE m_axi depth=20 port=res   offset=slave bundle=MASTER_BUS//#pragma HLS INTERFACE m_axi depth=1  port=res_2 offset=slave bundle=MASTER_BUS//100*1->10000*1->20*1->20*1->1*1void hunyihun(mat_t in[MAT_INPUT],mat_t layer1w[MAT_B_ROWS][MAT_B_COLS],mat_t layer3w[MAT_B_COLS][MAT_A_ROWS], mat_t layer2w[MAT_B_COLS][MAT_B_COLS],    mat_t res[MAT_A_ROWS][MAT_B_COLS],mat_t res_2[MAT_A_ROWS][MAT_B_COLS],mat_t res_3[MAT_A_ROWS][MAT_A_ROWS], mat_t layer1b[MAT_B_COLS],mat_t layer2b[MAT_B_COLS],mat_t layer3b[MAT_A_ROWS]){#pragma HLS INTERFACE s_axilite port=in    bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer1w     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer3w     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer2w     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer1b     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer3b     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=layer2b     bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=res   bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=res_2 bundle=CTRL_BUS#pragma HLS INTERFACE s_axilite port=res_3 bundle=CTRL_BUS#pragma HLS INTERFACE ap_ctrl_none port=return  float min,max;    mat_t res_middle[MAT_A_ROWS][MAT_B_COLS];    mat_t res_middle_2[MAT_A_ROWS][MAT_B_COLS];    //mat_t a[MAT_A_ROWS][MAT_A_COLS]={0};    mat_t res_layer1addbias [MAT_B_COLS];    mat_t res_layer2addbias [MAT_B_COLS];    mat_t in_2[MAT_A_ROWS][MAT_INPUT];    mat_t res_final[MAT_A_ROWS][MAT_A_ROWS];  min = 1000.0;  max = 0.0;  //get_max+min    for(int i=0;i    {      in_2[0][i] = in[i];//(in[i]-min)/(max-min)*(MAT_INPUT-1);    }  //[1*10000]*[10000*20]=[1*20];  hls::matrix_multiplymat_t,mat_t>(in_2,layer1w,res);  //layer 1 : kx+b  for(int i=0;i<20;i++)  {    
res_layer1addbias[i] = res[0][i] + layer1b[i];  }  //selu [1*20]  for(int i=0;i<20;i++)  {       res_middle[0][i] = scale * (res_layer1addbias[i] > 0 ? res_layer1addbias[i] : alpha * (hls::exp(res_layer1addbias[i])-1) );  }  //[1*20]*[20*20]=[1*20]  hls::matrix_multiplymat_t,mat_t>(res_middle,layer2w,res_2);  //layer 1 : kx+b  for(int i=0;i<20;i++)  {    res_layer2addbias[i] = res_2[0][i] + layer2b[i];  }  //selu [1*20]  for(int i=0;i<20;i++)  {       res_middle_2[0][i] = scale * (res_layer2addbias[i] > 0 ? res_layer2addbias[i] : alpha * (hls::exp(res_layer2addbias[i])-1) );  }    //[1*20]*[20*1]=[1*1]  hls::matrix_multiplymat_t,mat_t>(res_middle_2,layer3w,res_final);  res_3[0][0]=res_final[0][0]+layer3b[0];    min = 1000.0;    max=0.0;}



#ifndef HUNYIHUN_H
#define HUNYIHUN_H

// Shared declarations for the hunyihun fully connected accelerator:
// matrix dimensions, SELU constants, element types and the top-level prototype.

// NOTE(review): ap_cint.h is the C arbitrary-precision header, while the
// implementation uses C++ HLS libraries; confirm this include is accepted by
// the tool flow in use.
#include "ap_cint.h"

// Matrix dimensions.
#define MAT_A_ROWS 1
#define MAT_A_COLS 100
#define MAT_B_ROWS 100
#define MAT_B_COLS 20
#define MAT_INPUT 100

// SELU activation constants.
#define scale 1.0507009873554804934193349852946
#define alpha 1.6732632423543772848170429916717

// Element types.
typedef float mat_t;
typedef float result_t;
typedef int1  mat_int1;

// Top-level accelerator function; see the implementation file for details.
void hunyihun(mat_t in[MAT_INPUT],mat_t layer1w[MAT_B_ROWS][MAT_B_COLS],mat_t layer3w[MAT_B_COLS][MAT_A_ROWS], mat_t layer2w[MAT_B_COLS][MAT_B_COLS], mat_t res[MAT_A_ROWS][MAT_B_COLS],mat_t res_2[MAT_A_ROWS][MAT_B_COLS],mat_t res_3[MAT_A_ROWS][MAT_A_ROWS], mat_t layer1b[MAT_B_COLS],mat_t layer2b[MAT_B_COLS],mat_t layer3b[MAT_A_ROWS]);

#endif // HUNYIHUN_H

具体如何生成 RTL 级别的 HLS IP,可自行查阅相关资料,按照工具向导一键操作即可生成。各端口的地址分配由 HLS 软件自动生成(在工程中可以找到以 hw.h 结尾的文件,对照其中的地址写入数据即可),代码如下:


// ==============================================================
// Vivado(TM) HLS - High-Level Synthesis from C, C++ and SystemC v2019.1 (64-bit)
// Copyright 1986-2019 Xilinx, Inc. All Rights Reserved.
// ==============================================================
// CTRL_BUS
// 0x0200 ~
// 0x03ff : Memory 'in_r' (100 * 32b)
//          Word n : bit [31:0] - in_r[n]
// 0x2000 ~
// 0x3fff : Memory 'layer1w' (2000 * 32b)
//          Word n : bit [31:0] - layer1w[n]
// 0x4000 ~
// 0x407f : Memory 'layer3w_0' (20 * 32b)
//          Word n : bit [31:0] - layer3w_0[n]
// 0x4800 ~
// 0x4fff : Memory 'layer2w' (400 * 32b)
//          Word n : bit [31:0] - layer2w[n]
// 0x5000 ~
// 0x507f : Memory 'res' (20 * 32b)
//          Word n : bit [31:0] - res[n]
// 0x5080 ~
// 0x50ff : Memory 'res_2' (20 * 32b)
//          Word n : bit [31:0] - res_2[n]
// 0x5100 ~
// 0x5107 : Memory 'res_3' (1 * 32b)
//          Word n : bit [31:0] - res_3[n]
// 0x5180 ~
// 0x51ff : Memory 'layer1b' (20 * 32b)
//          Word n : bit [31:0] - layer1b[n]
// 0x5200 ~
// 0x527f : Memory 'layer2b' (20 * 32b)
//          Word n : bit [31:0] - layer2b[n]
// 0x5280 ~
// 0x5287 : Memory 'layer3b' (1 * 32b)
//          Word n : bit [31:0] - layer3b[n]
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)

// One group of four macros per memory: base and high byte offsets within
// CTRL_BUS, word width in bits, and depth in words (values as listed above).

#define XHUNYIHUN_CTRL_BUS_ADDR_IN_R_BASE 0x0200
#define XHUNYIHUN_CTRL_BUS_ADDR_IN_R_HIGH 0x03ff
#define XHUNYIHUN_CTRL_BUS_WIDTH_IN_R 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_IN_R 100

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER1W_BASE 0x2000
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER1W_HIGH 0x3fff
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER1W 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER1W 2000

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER3W_0_BASE 0x4000
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER3W_0_HIGH 0x407f
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER3W_0 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER3W_0 20

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER2W_BASE 0x4800
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER2W_HIGH 0x4fff
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER2W 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER2W 400

#define XHUNYIHUN_CTRL_BUS_ADDR_RES_BASE 0x5000
#define XHUNYIHUN_CTRL_BUS_ADDR_RES_HIGH 0x507f
#define XHUNYIHUN_CTRL_BUS_WIDTH_RES 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_RES 20

#define XHUNYIHUN_CTRL_BUS_ADDR_RES_2_BASE 0x5080
#define XHUNYIHUN_CTRL_BUS_ADDR_RES_2_HIGH 0x50ff
#define XHUNYIHUN_CTRL_BUS_WIDTH_RES_2 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_RES_2 20

#define XHUNYIHUN_CTRL_BUS_ADDR_RES_3_BASE 0x5100
#define XHUNYIHUN_CTRL_BUS_ADDR_RES_3_HIGH 0x5107
#define XHUNYIHUN_CTRL_BUS_WIDTH_RES_3 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_RES_3 1

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER1B_BASE 0x5180
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER1B_HIGH 0x51ff
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER1B 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER1B 20

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER2B_BASE 0x5200
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER2B_HIGH 0x527f
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER2B 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER2B 20

#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER3B_BASE 0x5280
#define XHUNYIHUN_CTRL_BUS_ADDR_LAYER3B_HIGH 0x5287
#define XHUNYIHUN_CTRL_BUS_WIDTH_LAYER3B 32
#define XHUNYIHUN_CTRL_BUS_DEPTH_LAYER3B 1

以上就是简单HLS生成卷积和全连接层加速器的示例流程,对于完整的加速器设计可以通过这种方法进行敏捷开发设计,进而进行与E203 RISC-V核的集成。

喜欢0
用户评论
zsf1998

zsf1998 实名认证

学习

积分
问答
粉丝
关注
  • RV-STAR 开发板
  • RISC-V处理器设计系列课程
  • 培养RISC-V大学土壤 共建RISC-V教育生态
RV-STAR 开发板