CMSIS-NN
Version 1.2.0
CMSIS NN Software Library
|
Macros | |
#define | USE_INTRINSIC |
Functions | |
arm_status | arm_convolve_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a) |
Basic s8 convolution function. More... | |
int32_t | arm_convolve_s8_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y) |
Get the required buffer size for s8 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_basic (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q7 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_basic_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Basic Q7 convolution function (non-square shape) More... | |
arm_status | arm_convolve_HWC_q15_basic (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q15 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_fast (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function (non-square shape) More... | |
arm_status | arm_convolve_1x1_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 version of 1x1 convolution (non-square shape) More... | |
arm_status | arm_convolve_1x1_s8_fast (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t out_activation_min, const int32_t out_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a) |
Fast s8 version for 1x1 convolution (non-square shape) More... | |
int32_t | arm_convolve_1x1_s8_fast_get_buffer_size (const uint16_t input_ch) |
Get the required buffer size for the fast 1x1 convolution (non-square shape) s8 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_RGB (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 version of convolution for RGB image. More... | |
arm_status | arm_convolve_HWC_q15_fast (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q15 convolution function. More... | |
arm_status | arm_convolve_HWC_q15_fast_nonsquare (const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q15 convolution function (non-square shape) More... | |
arm_status | arm_depthwise_separable_conv_HWC_q7 (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function. More... | |
arm_status | arm_depthwise_separable_conv_HWC_q7_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function (non-square shape) More... | |
arm_status | arm_depthwise_conv_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a) |
Basic s8 depthwise convolution function. More... | |
arm_status | arm_depthwise_conv_s8_opt (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a) |
Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More... | |
int32_t | arm_depthwise_conv_s8_opt_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y) |
Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More... | |
arm_status | arm_fully_connected_q7 (const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut, q15_t *vec_buffer) |
Q7 basic fully-connected layer function. More... | |
arm_status | arm_fully_connected_s8 (const int8_t *pInput, const int8_t *pWeight, const uint16_t col_dim, const uint16_t row_dim, const uint16_t nb_batches, const int32_t input_offset, const int32_t filter_offset, const int32_t out_mult, const int32_t out_shift, const int32_t output_offset, const int32_t *pBias, int8_t *pOut, const int32_t output_activation_min, const int32_t output_activation_max, q15_t *vec_buffer) |
S8 basic fully-connected and matrix multiplication layer function for TF Lite. More... | |
int32_t | arm_fully_connected_s8_get_buffer_size (const uint16_t col_dim) |
Get the required buffer size for S8 basic fully-connected and matrix multiplication layer function for TF Lite. More... | |
arm_status | arm_fully_connected_q7_opt (const q7_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut, q15_t *vec_buffer) |
Q7 opt fully-connected layer function. More... | |
arm_status | arm_fully_connected_q15 (const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q15_t *bias, q15_t *pOut, q15_t *vec_buffer) |
Q15 basic fully-connected layer function. More... | |
arm_status | arm_fully_connected_q15_opt (const q15_t *pV, const q15_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q15_t *bias, q15_t *pOut, q15_t *vec_buffer) |
Q15 opt fully-connected layer function. More... | |
arm_status | arm_fully_connected_mat_q7_vec_q15 (const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut, q15_t *vec_buffer) |
Mixed Q15-Q7 fully-connected layer function. More... | |
arm_status | arm_fully_connected_mat_q7_vec_q15_opt (const q15_t *pV, const q7_t *pM, const uint16_t dim_vec, const uint16_t num_of_rows, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q15_t *pOut, q15_t *vec_buffer) |
Mixed Q15-Q7 opt fully-connected layer function. More... | |
q7_t * | arm_nn_mat_mult_kernel_q7_q15 (const q7_t *pA, const q15_t *pInBuffer, const uint16_t ch_im_out, const uint16_t numCol_A, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut) |
Matrix-Multiplication Kernels for Convolution. More... | |
q7_t * | arm_nn_mat_mult_kernel_s8_s16 (const q7_t *input_a, const q15_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const uint16_t num_col_a, const int32_t *const output_bias, q7_t *out_0) |
Matrix-multiplication function for convolution with per-channel requantization. More... | |
q7_t * | arm_nn_mat_mult_kernel_s8_s16_reordered (const q7_t *input_a, const q15_t *input_b, const uint16_t output_ch, const int32_t *out_shift, const int32_t *out_mult, const int32_t out_offset, const int16_t activation_min, const int16_t activation_max, const uint16_t num_col_a, const int32_t *const output_bias, q7_t *out_0) |
Matrix-multiplication of re-ordered input B with A. More... | |
q7_t * | arm_nn_mat_mult_kernel_q7_q15_reordered (const q7_t *pA, const q15_t *pInBuffer, const uint16_t ch_im_out, const uint16_t numCol_A, const uint16_t bias_shift, const uint16_t out_shift, const q7_t *bias, q7_t *pOut) |
Matrix-multiplication function for convolution with reordered columns. More... | |
arm_status | arm_elementwise_add_s8 (const int8_t *input_1_vect, const int8_t *input_2_vect, const int32_t input_1_offset, const int32_t input_1_mult, const int32_t input_1_shift, const int32_t input_2_offset, const int32_t input_2_mult, const int32_t input_2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t out_activation_min, const int32_t out_activation_max, const uint32_t block_size) |
s8 element wise add of two vectors More... | |
arm_status | arm_elementwise_mul_s8 (const int8_t *input_1_vect, const int8_t *input_2_vect, const int32_t input_1_offset, const int32_t input_2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t out_activation_min, const int32_t out_activation_max, const uint32_t block_size) |
s8 element wise multiplication More... | |
void | arm_relu_q7 (q7_t *data, uint16_t size) |
Q7 RELU function. More... | |
void | arm_relu6_s8 (q7_t *data, uint16_t size) |
s8 ReLU6 function More... | |
void | arm_relu_q15 (q15_t *data, uint16_t size) |
Q15 RELU function. More... | |
void | arm_nn_activations_direct_q7 (q7_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type) |
Q7 neural network activation function using direct table look-up. More... | |
void | arm_nn_activations_direct_q15 (q15_t *data, uint16_t size, uint16_t int_width, arm_nn_activation_type type) |
Q15 neural network activation function using direct table look-up. More... | |
void | arm_maxpool_q7_HWC (q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out) |
Q7 max pooling function. More... | |
void | arm_avepool_q7_HWC (q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const uint16_t dim_im_out, q7_t *bufferA, q7_t *Im_out) |
Q7 average pooling function. More... | |
void | arm_avgpool_s8 (const int dim_src_height, const int dim_src_width, const int dim_dst_height, const int dim_dst_width, const int stride_height, const int stride_width, const int dim_kernel_height, const int dim_kernel_width, const int padding_height, const int padding_width, const int act_min, const int act_max, const int ch_src, int8_t *src, int16_t *bufferA, int8_t *dst) |
s8 average pooling function More... | |
int32_t | arm_avgpool_s8_get_buffer_size (const int dim_dst_width, const int ch_src) |
Get the required buffer size for S8 average pooling function. More... | |
void | arm_max_pool_s8_opt (const uint16_t input_y, const uint16_t input_x, const uint16_t output_y, const uint16_t output_x, const uint16_t stride_y, const uint16_t stride_x, const uint16_t kernel_y, const uint16_t kernel_x, const uint16_t pad_y, const uint16_t pad_x, const int8_t act_min, const int8_t act_max, const uint16_t depth, int8_t *input, int16_t *tmp_buffer, int8_t *output) |
s8 DSP optimized max pooling function More... | |
void | arm_max_pool_s8 (const uint16_t input_y, const uint16_t input_x, const uint16_t output_y, const uint16_t output_x, const uint16_t stride_y, const uint16_t stride_x, const uint16_t kernel_y, const uint16_t kernel_x, const uint16_t pad_y, const uint16_t pad_x, const int8_t act_min, const int8_t act_max, const uint16_t channel_in, int8_t *input, int16_t *tmp_buffer, int8_t *output) |
s8 pure C max pooling function More... | |
void | arm_softmax_q7 (const q7_t *vec_in, const uint16_t dim_vec, q7_t *p_out) |
Q7 softmax function. More... | |
void | arm_softmax_with_batch_q7 (const q7_t *vec_in, const uint16_t nb_batches, const uint16_t dim_vec, q7_t *p_out) |
Q7 softmax function with batch parameter. More... | |
void | arm_softmax_q15 (const q15_t *vec_in, const uint16_t dim_vec, q15_t *p_out) |
Q15 softmax function. More... | |
void | arm_softmax_s8 (const int8_t *input, const int32_t num_rows, const int32_t row_size, const int32_t mult, const int32_t shift, const int8_t diff_min, int8_t *output) |
S8 softmax function. More... | |
arm_status | arm_depthwise_conv_u8_basic_ver1 (const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t out_shift, const int32_t out_mult) |
uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory. More... | |
void | arm_reshape_s8 (const int8_t *input, int8_t *output, const uint32_t total_size) |
Reshape a s8 vector into another with different shape. More... | |
void | arm_concatenation_s8_x (const int8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_z, const uint16_t input_w, int8_t *output, const uint16_t output_x, const uint32_t offset_x) |
int8/uint8 concatenation function to be used for concatenating N-tensors along the X axis. This function should be called for each input tensor to concatenate. The argument offset_x will be used to store the input tensor in the correct position in the output tensor. More... | |
void | arm_concatenation_s8_y (const int8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_z, const uint16_t input_w, int8_t *output, const uint16_t output_y, const uint32_t offset_y) |
int8/uint8 concatenation function to be used for concatenating N-tensors along the Y axis. This function should be called for each input tensor to concatenate. The argument offset_y will be used to store the input tensor in the correct position in the output tensor. More... | |
void | arm_concatenation_s8_z (const int8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_z, const uint16_t input_w, int8_t *output, const uint16_t output_z, const uint32_t offset_z) |
int8/uint8 concatenation function to be used for concatenating N-tensors along the Z axis. This function should be called for each input tensor to concatenate. The argument offset_z will be used to store the input tensor in the correct position in the output tensor. More... | |
void | arm_concatenation_s8_w (const int8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_z, const uint16_t input_w, int8_t *output, const uint32_t offset_w) |
int8/uint8 concatenation function to be used for concatenating N-tensors along the W axis (batch size). This function should be called for each input tensor to concatenate. The argument offset_w will be used to store the input tensor in the correct position in the output tensor. More... | |
#define USE_INTRINSIC |
q7_t* arm_nn_mat_mult_kernel_q7_q15 | ( | const q7_t * | pA, |
const q15_t * | pInBuffer, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | numCol_A, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
const q7_t * | bias, | ||
q7_t * | pOut | ||
) |
These functions are used within convolution layer functions for matrix multiplication.
The implementation is similar to the CMSIS-DSP arm_mat_mult functions, with one Q7 operand and one Q15 operand. The Q15 operand is the im2col output, which always has 2 columns. Matrix-multiplication function for convolution.
[in] | pA | pointer to operand A |
[in] | pInBuffer | pointer to operand B, always consists of 2 vectors |
[in] | ch_im_out | numRow of A |
[in] | numCol_A | numCol of A |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in] | bias | the bias |
[in,out] | pOut | pointer to output |
Matrix-Multiplication Kernels for Convolution.
Refer to header file for details.
References NN_ROUND.
Referenced by arm_convolve_HWC_q7_basic(), arm_convolve_HWC_q7_basic_nonsquare(), and arm_convolve_HWC_q7_RGB().
q7_t* arm_nn_mat_mult_kernel_q7_q15_reordered | ( | const q7_t * | pA, |
const q15_t * | pInBuffer, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | numCol_A, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
const q7_t * | bias, | ||
q7_t * | pOut | ||
) |
[in] | pA | pointer to operand A |
[in] | pInBuffer | pointer to operand B, always consists of 2 vectors |
[in] | ch_im_out | numRow of A |
[in] | numCol_A | numCol of A |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in] | bias | the bias |
[in,out] | pOut | pointer to output |
This function assumes that the data in pInBuffer are reordered.
Matrix-multiplication function for convolution with reordered columns.
Refer to header file for details.
References NN_ROUND.
Referenced by arm_convolve_1x1_HWC_q7_fast_nonsquare(), arm_convolve_HWC_q7_fast(), and arm_convolve_HWC_q7_fast_nonsquare().
q7_t* arm_nn_mat_mult_kernel_s8_s16 | ( | const q7_t * | input_a, |
const q15_t * | input_b, | ||
const uint16_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int16_t | activation_min, | ||
const int16_t | activation_max, | ||
const uint16_t | num_col_a, | ||
const int32_t *const | output_bias, | ||
q7_t * | out_0 | ||
) |
[in] | input_a | pointer to operand A |
[in] | input_b | pointer to operand B, always consists of 2 vectors. |
[in] | output_ch | number of rows of A |
[in] | out_shift | pointer to per output channel requantization shift parameter. |
[in] | out_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. |
[in] | activation_min | minimum value to clamp the output to. Range : int8 |
[in] | activation_max | maximum value to clamp the output to. Range : int8 |
[in] | num_col_a | number of columns of A |
[in] | output_bias | per output channel bias. Range : int32 |
[in,out] | out_0 | pointer to output |
This function does the matrix multiplication of weight matrix for all output channels with 2 columns from im2col and produces two elements/output_channel. The outputs are clamped in the range provided by activation min and max. Supported framework: TensorFlow Lite micro.
References arm_nn_read_q15x2_ia(), arm_nn_requantize(), MAX, and MIN.
Referenced by arm_convolve_s8().
q7_t* arm_nn_mat_mult_kernel_s8_s16_reordered | ( | const q7_t * | input_a, |
const q15_t * | input_b, | ||
const uint16_t | output_ch, | ||
const int32_t * | out_shift, | ||
const int32_t * | out_mult, | ||
const int32_t | out_offset, | ||
const int16_t | activation_min, | ||
const int16_t | activation_max, | ||
const uint16_t | num_col_a, | ||
const int32_t *const | output_bias, | ||
q7_t * | out_0 | ||
) |
For arguments, refer to arm_nn_mat_mult_kernel_s8_s16. The re-ordering is a consequence of the sign extension done by the SXTB16 instruction on input_b. The outputs are clamped in the range provided by activation min and max.
References arm_nn_read_q15x2_ia(), arm_nn_requantize(), MAX, and MIN.
Referenced by arm_convolve_1x1_s8_fast().