Series 1 (https://blog.csdn.net/yongjiankuang/article/details/102470457) covered training an MNIST model in TensorFlow and exporting the trained parameters. This post uses those exported parameters to build the MNIST forward (inference) network in C code. The implementation follows below, starting with common.h, which defines the basic types and declares the exported weight and bias arrays:
#ifndef __COMMON_H_
#define __COMMON_H_

#include <iostream>
#include <stdint.h>

using namespace std;

typedef unsigned char  u8;
typedef signed char    s8;
typedef unsigned short u16;
typedef short          s16;
typedef unsigned int   u32;
typedef int            s32;
typedef float          f32;

// trained parameters exported from TensorFlow (defined in params.cpp)
extern f32 W_conv1_0[];
extern f32 b_conv1_0[];
extern f32 W_conv2_0[];
extern f32 b_conv2_0[];
extern f32 W_fc1_0[];
extern f32 b_fc1_0[];
extern f32 W_fc2_0[];
extern f32 b_fc2_0[];

#endif
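For reference, params.cpp only needs to define the arrays declared above. A minimal hypothetical sketch is shown here; the array sizes are inferred from the layer calls in main below, and the actual values come from the series-1 export:

// params.cpp -- hypothetical sketch; series 1 generates the real file with trained values
#include "common.h"

f32 W_conv1_0[5 * 5 * 1 * 32]  = { /* exported conv1 weights */ };
f32 b_conv1_0[32]              = { /* exported conv1 biases  */ };
f32 W_conv2_0[5 * 5 * 32 * 64] = { /* exported conv2 weights */ };
f32 b_conv2_0[64]              = { /* exported conv2 biases  */ };
f32 W_fc1_0[128 * 7 * 7 * 64]  = { /* exported fc1 weights   */ };
f32 b_fc1_0[128]               = { /* exported fc1 biases    */ };
f32 W_fc2_0[10 * 128]          = { /* exported fc2 weights   */ };
f32 b_fc2_0[10]                = { /* exported fc2 biases    */ };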
layer.h and layer.cpp declare and implement the network layers:
#ifndef __LAYER_H_
#define __LAYER_H_

#include "common.h"

int arm_convolve_HWC_f32_basic(f32 * Im_in,
                               const u16 dim_im_in_X,
                               const u16 dim_im_in_Y,
                               const u16 ch_im_in,
                               f32 * wt,
                               const u16 ch_im_out,
                               const u16 dim_kernel_X,
                               const u16 dim_kernel_Y,
                               const u16 padding_X,
                               const u16 padding_Y,
                               const u16 stride_X,
                               const u16 stride_Y,
                               f32 * bias,
                               f32 * Im_out,
                               const u16 dim_im_out_X,
                               const u16 dim_im_out_Y);

int arm_relu_f32(f32 *data, u16 size);

int arm_maxpool_f32_HWC(f32 * Im_in,
                        const u16 dim_im_in,
                        const u16 ch_im_in,
                        const u16 dim_kernel,
                        const u16 padding,
                        const u16 stride,
                        const u16 dim_im_out,
                        f32 * Im_out);

int arm_fully_connected_f32(f32 * pV,
                            f32 * pM,
                            const u16 dim_vec,
                            const u16 num_of_rows,
                            f32 * bias,
                            f32 * pOut);

int arm_softmax_f32(f32 * vec_in, const u16 dim_vec, f32 * p_out);

#endif
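All of these routines assume the HWC (interleaved channel) layout that TensorFlow's NHWC tensors flatten to. As a reading aid, the offset of element (row, col, ch) in a feature map of width dim_X with ch_im_in channels is computed throughout as in this small sketch (the helper name is made up; the layer code inlines the arithmetic):

// index helper equivalent to the inline arithmetic used in the layer code
static inline int hwc_offset(int row, int col, int ch, int dim_X, int ch_im_in)
{
    return (row * dim_X + col) * ch_im_in + ch;
}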
#include "layer.h"
//conv layer (HWC layout, zero padding)
int arm_convolve_HWC_f32_basic(f32 * Im_in,
                               const u16 dim_im_in_X,
                               const u16 dim_im_in_Y,
                               const u16 ch_im_in,
                               f32 * wt,
                               const u16 ch_im_out,
                               const u16 dim_kernel_X,
                               const u16 dim_kernel_Y,
                               const u16 padding_X,
                               const u16 padding_Y,
                               const u16 stride_X,
                               const u16 stride_Y,
                               f32 * bias,
                               f32 * Im_out,
                               const u16 dim_im_out_X,
                               const u16 dim_im_out_Y)
{
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    u16 i, j, k, l, m, n;
    f32 conv_out;
    s16 in_row, in_col; /* s16, not signed char: avoids overflow for inputs wider than 127 */

    for (i = 0; i < ch_im_out; i++)
    {
        for (j = 0; j < dim_im_out_Y; j++)
        {
            for (k = 0; k < dim_im_out_X; k++)
            {
                conv_out = bias[i];
                for (m = 0; m < dim_kernel_Y; m++)
                {
                    for (n = 0; n < dim_kernel_X; n++)
                    {
                        // if-for implementation: skip taps that fall in the zero padding
                        in_row = stride_Y * j + m - padding_Y;
                        in_col = stride_X * k + n - padding_X;
                        if (in_row >= 0 && in_col >= 0 && in_row < dim_im_in_Y && in_col < dim_im_in_X)
                        {
                            for (l = 0; l < ch_im_in; l++)
                            {
                                conv_out +=
                                    Im_in[(in_row * dim_im_in_X + in_col) * ch_im_in + l] *
                                    wt[i * ch_im_in * dim_kernel_X * dim_kernel_Y + (m * dim_kernel_X + n) * ch_im_in + l];
                            }
                        }
                    }
                }
                Im_out[i + (j * dim_im_out_X + k) * ch_im_out] = conv_out;
            }
        }
    }
    return 0;
}
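A quick sanity check for the convolution is an identity kernel: with a single 3x3 kernel whose center tap is 1, zero bias, padding 1, and stride 1, the output must reproduce the input. A minimal sketch (the test values and function name are made up):

// hypothetical sanity check: 4x4 single-channel input, 3x3 identity kernel
void test_conv_identity(void)
{
    f32 in[4 * 4] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
    f32 wt[3 * 3] = { 0, 0, 0,
                      0, 1, 0,   // center tap only
                      0, 0, 0 };
    f32 bias[1] = { 0 };
    f32 out[4 * 4];
    arm_convolve_HWC_f32_basic(in, 4, 4, 1, wt, 1, 3, 3, 1, 1, 1, 1, bias, out, 4, 4);
    // out now equals in, element for element
}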
//relu
int arm_relu_f32(f32 *data, u16 size)
{
    u16 i;
    for (i = 0; i < size; i++)
    {
        if (data[i] < 0)
            data[i] = 0;
    }
    return 0;
}
//maxpool layer
int arm_maxpool_f32_HWC(f32 * Im_in,
                        const u16 dim_im_in,
                        const u16 ch_im_in,
                        const u16 dim_kernel,
                        const u16 padding,
                        const u16 stride,
                        const u16 dim_im_out,
                        f32 * Im_out)
{
    /* Run the following code as reference implementation for Cortex-M0 and Cortex-M3 */
    s16 i_ch_in, i_x, i_y;
    s16 k_x, k_y;

    for (i_ch_in = 0; i_ch_in < ch_im_in; i_ch_in++)
    {
        for (i_y = 0; i_y < dim_im_out; i_y++)
        {
            for (i_x = 0; i_x < dim_im_out; i_x++)
            {
                f32 max = -FLT_MAX; /* safer than a magic constant like -10000 */
                for (k_y = i_y * stride - padding; k_y < i_y * stride - padding + dim_kernel; k_y++)
                {
                    for (k_x = i_x * stride - padding; k_x < i_x * stride - padding + dim_kernel; k_x++)
                    {
                        /* only pixels inside the image take part */
                        if (k_y >= 0 && k_x >= 0 && k_y < dim_im_in && k_x < dim_im_in)
                        {
                            if (Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)] > max)
                            {
                                max = Im_in[i_ch_in + ch_im_in * (k_x + k_y * dim_im_in)];
                            }
                        }
                    }
                }
                Im_out[i_ch_in + ch_im_in * (i_x + i_y * dim_im_out)] = max;
            }
        }
    }
    return 0;
}
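For the 2x2, stride-2 pooling used in this network, each output pixel is simply the max of a non-overlapping 2x2 window, so a 4x4 single-channel map reduces to 2x2. A minimal sketch (test values made up):

// hypothetical check: 4x4 single-channel map, 2x2 kernel, stride 2, no padding
void test_maxpool(void)
{
    f32 in[4 * 4] = {  1,  2,  3,  4,
                       5,  6,  7,  8,
                       9, 10, 11, 12,
                      13, 14, 15, 16 };
    f32 out[2 * 2];
    arm_maxpool_f32_HWC(in, 4, 1, 2, 0, 2, 2, out);
    // expected: out = { 6, 8, 14, 16 }
}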
//fully connected layer
int arm_fully_connected_f32(f32 * pV,
                            f32 * pM,
                            const u16 dim_vec,
                            const u16 num_of_rows,
                            f32 * bias,
                            f32 * pOut)
{
    int i, j;
    for (i = 0; i < num_of_rows; i++)
    {
        f32 ip_out = bias[i];
        for (j = 0; j < dim_vec; j++)
        {
            ip_out += pV[j] * pM[i * dim_vec + j];
        }
        pOut[i] = ip_out;
    }
    return 0;
}
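Note the weight layout: pM is row-major with num_of_rows rows of dim_vec entries each, i.e. each output neuron's weights are contiguous. That is the transpose of TensorFlow's usual [dim_vec, num_units] fc variable, which the series-1 export presumably accounts for; worth double-checking if the results look wrong. A tiny sketch (values made up):

// hypothetical check: 2 outputs from a 3-element input vector
void test_fc(void)
{
    f32 v[3] = { 1, 2, 3 };
    f32 M[2 * 3] = { 1, 0, 0,    // row 0 picks v[0]
                     0, 0, 1 };  // row 1 picks v[2]
    f32 b[2] = { 10, 20 };
    f32 out[2];
    arm_fully_connected_f32(v, M, 3, 2, b, out);
    // expected: out = { 11, 23 }
}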
//softmax layer
int arm_softmax_f32(f32 * vec_in, const u16 dim_vec, f32 * p_out)
{
    f32 sum = 0;
    s16 i;
    f32 base;

    /* find the max logit; subtracting it before exponentiating keeps expf() stable */
    base = vec_in[0];
    for (i = 0; i < dim_vec; i++)
    {
        if (vec_in[i] > base)
        {
            base = vec_in[i];
        }
    }
    for (i = 0; i < dim_vec; i++)
    {
        p_out[i] = expf(vec_in[i] - base);
        sum += p_out[i];
    }
    for (i = 0; i < dim_vec; i++)
    {
        p_out[i] = p_out[i] / sum;
    }
    return 0;
}
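Subtracting the maximum logit (base) before exponentiating is the standard numerical-stability trick: expf() of a large logit overflows a float, while expf(x - max) is always at most 1, and the subtraction cancels out in the final ratio. A quick illustration (values made up):

// hypothetical check: logits large enough that a naive expf(1000.0f) would overflow
void test_softmax(void)
{
    f32 logits[2] = { 1000.0f, 1001.0f };
    f32 probs[2];
    arm_softmax_f32(logits, 2, probs);
    // expected: probs ~= { 0.269, 0.731 }, i.e. expf(-1)/(expf(-1)+1) and 1/(expf(-1)+1)
}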
Below is the main function:
#include "layer.h"
#include "input.h"
void main()
{
//malloc for layers
//first layer conv
int size = 28 * 28 * 1;
f32 *conv1_out = (f32*)malloc(size * sizeof(f32));
memset(conv1_out, 0, size * sizeof(f32));
//second layer pool
size = 14 * 14 * 32;
f32 *p1_out = (f32*)malloc(size * sizeof(f32));
memset(p1_out, 0, size * sizeof(f32));
//third layer conv
size = 14 * 14 * 64;
f32 *conv2_out = (f32*)malloc(size * sizeof(f32));
memset(conv2_out, 0, size * sizeof(f32));
//four layer pool
size = 7 * 7 * 64;
f32 *p2_out = (f32*)malloc(size * sizeof(f32));
memset(p2_out,0,size * sizeof(f32));
//five layer full connected
size = 128;
f32 *fc1_out = (f32*)malloc(size * sizeof(f32));
memset(fc1_out,0,size * sizeof(f32));
//six layer full connected
size = 10;
f32 *fc2_out = (f32*)malloc(size * sizeof(f32));
memset(fc2_out,0,size * sizeof(f32));
//softmax
f32 *sf_out = (f32*)malloc(size * sizeof(f32));
memset(sf_out,0,size * sizeof(f32));
//------forward (conv1 -> pool1 -> conv2 -> pool2 -> fc1 -> fc2 -> softmax)
//conv1
f32 in_arr[] = input;
arm_convolve_HWC_f32_basic(in_arr,28,28,1,W_conv1_0,32,5,5,2,2,1,1,b_conv1_0,conv1_out,28,28);
arm_relu_f32(conv1_out,28 * 28 * 32);
//pool1
arm_maxpool_f32_HWC(conv1_out,28,32,2,0,2,14,p1_out);
//conv2
arm_convolve_HWC_f32_basic(p1_out,14,14,32,W_conv2_0,64,5,5,2,2,1,1,b_conv2_0,conv2_out,14,14);
arm_relu_f32(conv2_out,14 * 14 * 64);
//pool2
arm_maxpool_f32_HWC(conv2_out,14,64,2,0,2,7,p2_out);
//full connected
arm_fully_connected_f32(p2_out,W_fc1_0,7 * 7 * 64,128,b_fc1_0,fc1_out);
arm_relu_f32(fc1_out,128);
//softmax
arm_fully_connected_f32(fc1_out,W_fc2_0,128,10,b_fc2_0,fc2_out);
arm_softmax_f32(fc2_out, 10, sf_out);
free(conv1_out);
free(p1_out);
free(conv2_out);
free(p2_out);
free(fc1_out);
free(fc2_out);
system("pause");
}
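Assuming the sources are split as shown above into layer.cpp, params.cpp, and a main.cpp holding the main function (the file names are hypothetical), the project should build with any C++ compiler, e.g. g++ main.cpp layer.cpp params.cpp -o mnist_forward, after which the printed probabilities can be compared against the TensorFlow model's output for the same input.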
params.cpp is not included here because the amount of data is fairly large; it can be generated by series 1. With series 1 and 2 together, it is easy to deploy a TensorFlow model to an embedded target, and I believe you can do the same. My knowledge is limited, so please point out anything that is wrong. Thanks!
来源:https://blog.csdn.net/yongjiankuang/article/details/102470759