CMSIS-NN
Version 1.2.0
CMSIS NN Software Library
Macros | |
#define | DIM_KER_X |
#define | DIM_KER_Y |
Functions | |
arm_status | arm_convolve_1x1_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 version of 1x1 convolution (non-square shape) More... | |
arm_status | arm_convolve_1x1_s8_fast (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t out_activation_min, const int32_t out_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a) |
Fast s8 version for 1x1 convolution (non-square shape) More... | |
int32_t | arm_convolve_1x1_s8_fast_get_buffer_size (const uint16_t input_ch) |
Get the required buffer size for the fast 1x1 convolution (non-square shape) s8 convolution function. More... | |
arm_status | arm_convolve_HWC_q15_basic (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q15 convolution function. More... | |
arm_status | arm_convolve_HWC_q15_fast (const q15_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q15 convolution function. More... | |
arm_status | arm_convolve_HWC_q15_fast_nonsquare (const q15_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q15_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q15_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q15_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q15 convolution function (non-square shape) More... | |
arm_status | arm_convolve_HWC_q7_basic (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Basic Q7 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_basic_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Basic Q7 convolution function (non-square shape) More... | |
arm_status | arm_convolve_HWC_q7_fast (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function. More... | |
arm_status | arm_convolve_HWC_q7_fast_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Fast Q7 convolution function (non-square shape) More... | |
arm_status | arm_convolve_HWC_q7_RGB (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 convolution function for RGB image. More... | |
arm_status | arm_convolve_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint16_t input_batches, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const int32_t out_offset, const int32_t input_offset, const int32_t out_activation_min, const int32_t out_activation_max, const uint16_t output_x, const uint16_t output_y, q15_t *buffer_a) |
Basic s8 convolution function. More... | |
int32_t | arm_convolve_s8_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y) |
Get the required buffer size for s8 convolution function. More... | |
arm_status | arm_depthwise_conv_s8 (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t ch_mult, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a) |
Basic s8 depthwise convolution function. More... | |
arm_status | arm_depthwise_conv_s8_opt (const q7_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const q7_t *kernel, const uint16_t output_ch, const uint16_t kernel_x, const uint16_t kernel_y, const uint16_t pad_x, const uint16_t pad_y, const uint16_t stride_x, const uint16_t stride_y, const int32_t *bias, q7_t *output, const int32_t *output_shift, const int32_t *output_mult, const uint16_t output_x, const uint16_t output_y, const int32_t output_offset, const int32_t input_offset, const int32_t output_activation_min, const int32_t output_activation_max, const uint16_t dilation_x, const uint16_t dilation_y, q15_t *buffer_a) |
Optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More... | |
int32_t | arm_depthwise_conv_s8_opt_get_buffer_size (const uint16_t input_ch, const uint16_t kernel_x, const uint16_t kernel_y) |
Get the required buffer size for optimized s8 depthwise convolution function with constraint that in_channel equals out_channel. More... | |
arm_status | arm_depthwise_conv_u8_basic_ver1 (const uint8_t *input, const uint16_t input_x, const uint16_t input_y, const uint16_t input_ch, const uint8_t *kernel, const uint16_t kernel_x, const uint16_t kernel_y, const int16_t ch_mult, const int16_t pad_x, const int16_t pad_y, const int16_t stride_x, const int16_t stride_y, const int16_t dilation_x, const int16_t dilation_y, const int32_t *bias, const int32_t input_offset, const int32_t filter_offset, const int32_t output_offset, uint8_t *output, const uint16_t output_x, const uint16_t output_y, const int32_t output_activation_min, const int32_t output_activation_max, const int32_t out_shift, const int32_t out_mult) |
uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory. Both square and non-square inputs are accepted. More... | |
arm_status | arm_depthwise_separable_conv_HWC_q7 (const q7_t *Im_in, const uint16_t dim_im_in, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel, const uint16_t padding, const uint16_t stride, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function. More... | |
arm_status | arm_depthwise_separable_conv_HWC_q7_nonsquare (const q7_t *Im_in, const uint16_t dim_im_in_x, const uint16_t dim_im_in_y, const uint16_t ch_im_in, const q7_t *wt, const uint16_t ch_im_out, const uint16_t dim_kernel_x, const uint16_t dim_kernel_y, const uint16_t padding_x, const uint16_t padding_y, const uint16_t stride_x, const uint16_t stride_y, const q7_t *bias, const uint16_t bias_shift, const uint16_t out_shift, q7_t *Im_out, const uint16_t dim_im_out_x, const uint16_t dim_im_out_y, q15_t *bufferA, q7_t *bufferB) |
Q7 depthwise separable convolution function (non-square shape) More... | |
Perform convolution layer
The convolution is implemented in 2 steps: im2col and GEMM
im2col is a process of converting each patch of image data into a column. After im2col, the convolution is computed as matrix-matrix multiplication.
To reduce the memory footprint, the im2col is performed partially. In each iteration, only a few columns (i.e., patches) are generated and computed with GEMM kernels similar to the CMSIS-DSP arm_mat_mult functions.
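As an illustration of the idea only (not the library's actual implementation, which works on a few columns at a time and expands the data into SIMD-friendly q15 buffers), the following plain-C sketch builds the im2col column for a single output position of a square, HWC-layout input. The function name im2col_hwc_patch is made up for this example.

#include <stdint.h>

/* Copy one kernel-sized patch of an HWC-layout image into a column,
 * zero-padding positions that fall outside the image. */
static void im2col_hwc_patch(const int8_t *im_in,      /* [H, W, C] input */
                             int dim_im_in, int ch_im_in,
                             int dim_kernel, int padding, int stride,
                             int out_x, int out_y,     /* output coordinate */
                             int8_t *col_out)          /* [K*K*C] column */
{
    int idx = 0;
    for (int ky = 0; ky < dim_kernel; ky++)
    {
        for (int kx = 0; kx < dim_kernel; kx++)
        {
            int in_y = out_y * stride - padding + ky;
            int in_x = out_x * stride - padding + kx;
            for (int c = 0; c < ch_im_in; c++)
            {
                if (in_y < 0 || in_y >= dim_im_in || in_x < 0 || in_x >= dim_im_in)
                    col_out[idx++] = 0;                 /* padded region */
                else
                    col_out[idx++] = im_in[(in_y * dim_im_in + in_x) * ch_im_in + c];
            }
        }
    }
}

Once a handful of such columns have been gathered, the convolution reduces to multiplying the weight matrix by this column matrix, which is what the arm_nn_mat_mult_kernel_* GEMM kernels do.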
#define DIM_KER_X |
Referenced by arm_convolve_1x1_s8_fast().
#define DIM_KER_Y |
Referenced by arm_convolve_1x1_s8_fast().
arm_status arm_convolve_1x1_HWC_q7_fast_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in_x | input tensor dimension x |
[in] | dim_im_in_y | input tensor dimension y |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel_x | filter kernel size x |
[in] | dim_kernel_y | filter kernel size y |
[in] | padding_x | padding size x |
[in] | padding_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out_x | output tensor dimension x |
[in] | dim_im_out_y | output tensor dimension y |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. This function is optimized for convolution with a 1x1 kernel size (i.e., dim_kernel_x = 1 and dim_kernel_y = 1). It can be used for the second half of MobileNets [1] after the depthwise separable convolution.
This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in must be a multiple of 4 and ch_im_out must be a multiple of 2.
[1] MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861
References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.
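A hedged usage sketch follows. The call order matches the signature above; the layer dimensions, shift amounts, and the bufferA sizing (2 * ch_im_in * dim_kernel_x * dim_kernel_y q15 entries, assumed by analogy with the buffer notes of the other HWC Q7 kernels, which reduces to 2 * ch_im_in for a 1x1 kernel) are illustrative assumptions, not values stated for this function.

#include "arm_nnfunctions.h"

/* Illustrative 1x1 pointwise layer: ch_im_in is a multiple of 4 and
 * ch_im_out is a multiple of 2, as the constraints above require. */
#define IN_X   16
#define IN_Y   16
#define CH_IN  32
#define CH_OUT 64

static const q7_t weights[CH_OUT * CH_IN];          /* 1x1 kernels */
static const q7_t biases[CH_OUT];
static q7_t       im_in[IN_Y * IN_X * CH_IN];
static q7_t       im_out[IN_Y * IN_X * CH_OUT];
/* Assumed by analogy with the other fast HWC Q7 kernels. */
static q15_t      bufferA[2 * CH_IN * 1 * 1];

void run_pointwise_layer(void)
{
    arm_status status = arm_convolve_1x1_HWC_q7_fast_nonsquare(
        im_in, IN_X, IN_Y, CH_IN,
        weights, CH_OUT,
        1, 1,            /* dim_kernel_x, dim_kernel_y           */
        0, 0,            /* padding_x, padding_y                 */
        1, 1,            /* stride_x, stride_y                   */
        biases, 0, 7,    /* bias_shift, out_shift: layer-specific placeholders */
        im_out, IN_X, IN_Y,
        bufferA, NULL);  /* bufferB: no scratch needed here      */
    (void)status;        /* ARM_MATH_SIZE_MISMATCH if constraints fail */
}

Because the constraints are checked at run time, the returned status should be tested for ARM_MATH_SIZE_MISMATCH.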
arm_status arm_convolve_1x1_s8_fast | ( | const q7_t * | input, |
const uint16_t | input_x, | ||
const uint16_t | input_y, | ||
const uint16_t | input_ch, | ||
const uint16_t | input_batches, | ||
const q7_t * | kernel, | ||
const uint16_t | output_ch, | ||
const uint16_t | pad_x, | ||
const uint16_t | pad_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const int32_t * | bias, | ||
q7_t * | output, | ||
const int32_t * | output_shift, | ||
const int32_t * | output_mult, | ||
const int32_t | out_offset, | ||
const int32_t | input_offset, | ||
const int32_t | out_activation_min, | ||
const int32_t | out_activation_max, | ||
const uint16_t | output_x, | ||
const uint16_t | output_y, | ||
q15_t * | buffer_a | ||
) |
[in] | input | pointer to input tensor. Format: [N, H, W, in_ch] |
[in] | input_x | input tensor dimension x |
[in] | input_y | input tensor dimension y |
[in] | input_ch | number of input tensor channels |
[in] | input_batches | number of input batches |
[in] | kernel | pointer to kernel weights. Format: [out_ch, H, W, in_ch] |
[in] | output_ch | number of filters, i.e., output tensor channels |
[in] | pad_x | padding size x |
[in] | pad_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to per channel bias. Range : int32 |
[in,out] | output | pointer to output tensor. Format: [H, W, out_ch] |
[in] | output_shift | pointer to per output channel requantization shift parameter. |
[in] | output_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. Range: int8 |
[in] | input_offset | input tensor offset. Range: int8 |
[in] | out_activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | out_activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | output_x | output tensor width |
[in] | output_y | output tensor height |
[in] | buffer_a | pointer to buffer space used for input optimization (partial im2col); required when ARM_MATH_LOOPUNROLL and ARM_MATH_DSP are defined. Required space: 2 * input_ch * sizeof(q15_t) bytes. Use arm_convolve_1x1_s8_fast_get_buffer_size() to get the size |
ARM_MATH_SIZE_MISMATCH
if argument constraints fail, or ARM_MATH_SUCCESS
on successful completion. References arm_convolve_s8(), arm_nn_mat_mult_kernel_s8_s16_reordered(), arm_nn_read_q15x2_ia(), arm_nn_requantize(), arm_q7_to_q15_reordered_with_offset(), DIM_KER_X, DIM_KER_Y, MAX, and MIN.
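A minimal usage sketch, assuming illustrative dimensions and placeholder quantization parameters; the scratch buffer is sized with the library's own arm_convolve_1x1_s8_fast_get_buffer_size() helper rather than a hard-coded formula.

#include <stdlib.h>
#include "arm_nnfunctions.h"

/* All dimensions and quantization parameters below are placeholders. */
#define IN_X   8
#define IN_Y   8
#define IN_CH  32
#define OUT_CH 16
#define BATCH  1

static const q7_t    kernel[OUT_CH * IN_CH];     /* [out_ch, 1, 1, in_ch] */
static const int32_t bias[OUT_CH];
static const int32_t out_mult[OUT_CH];           /* per-channel requantization */
static const int32_t out_shift[OUT_CH];
static q7_t          input[BATCH * IN_Y * IN_X * IN_CH];
static q7_t          output[IN_Y * IN_X * OUT_CH];

arm_status pointwise_s8(void)
{
    /* Scratch sized by the library helper; 0 means no scratch is needed. */
    const int32_t buf_size = arm_convolve_1x1_s8_fast_get_buffer_size(IN_CH);
    q15_t *buffer_a = (buf_size > 0) ? (q15_t *)malloc((size_t)buf_size) : NULL;

    arm_status status = arm_convolve_1x1_s8_fast(
        input, IN_X, IN_Y, IN_CH, BATCH,
        kernel, OUT_CH,
        0, 0,            /* pad_x, pad_y       */
        1, 1,            /* stride_x, stride_y */
        bias, output, out_shift, out_mult,
        0, 0,            /* out_offset, input_offset          */
        -128, 127,       /* out_activation_min/max            */
        IN_X, IN_Y,      /* output_x/y: 1x1, stride 1, no pad */
        buffer_a);

    free(buffer_a);
    return status;       /* ARM_MATH_SIZE_MISMATCH if constraints fail */
}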
int32_t arm_convolve_1x1_s8_fast_get_buffer_size | ( | const uint16_t | input_ch | ) |
[in] | input_ch | number of input tensor channels |
arm_status arm_convolve_HWC_q15_basic | ( | const q15_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q15_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q15_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q15_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SUCCESS
Buffer size:
bufferA size: ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
This basic version is designed to work for any input tensor and weight dimension.
References NN_ROUND.
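The legacy HWC kernels in this group have no get_buffer_size helpers, so bufferA has to be sized from the "Buffer size" notes quoted in each description. A minimal sizing sketch, interpreting the quoted formulas as counts of q15_t entries (the type of bufferA) and using made-up macro names:

#include "arm_nnfunctions.h"

#define CH_IM_IN   16   /* illustrative layer parameters */
#define DIM_KERNEL 3

/* q15 basic kernel (above): ch_im_in * dim_kernel * dim_kernel entries */
static q15_t bufferA_q15_basic[CH_IM_IN * DIM_KERNEL * DIM_KERNEL];

/* q15/q7 fast and q7 basic kernels (below): 2 * ch_im_in * dim_kernel * dim_kernel entries */
static q15_t bufferA_fast[2 * CH_IM_IN * DIM_KERNEL * DIM_KERNEL];

/* bufferB size is documented as 0, so NULL can be passed for it. */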
arm_status arm_convolve_HWC_q15_fast | ( | const q15_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q15_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q15_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q15_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in is a multiple of 2
ch_im_out is a multiple of 2
References NN_ROUND.
arm_status arm_convolve_HWC_q15_fast_nonsquare | ( | const q15_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q15_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q15_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q15_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in_x | input tensor dimension x |
[in] | dim_im_in_y | input tensor dimension y |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel_x | filter kernel size x |
[in] | dim_kernel_y | filter kernel size y |
[in] | padding_x | padding size x |
[in] | padding_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out_x | output tensor dimension x |
[in] | dim_im_out_y | output tensor dimension y |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in is a multiple of 2
ch_im_out is a multiple of 2
References NN_ROUND.
arm_status arm_convolve_HWC_q7_basic | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SUCCESS
Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
This basic version is designed to work for any input tensor and weight dimension.
References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.
arm_status arm_convolve_HWC_q7_basic_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Basic Q7 convolution function (non-square shape)
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in_x | input tensor dimension x |
[in] | dim_im_in_y | input tensor dimension y |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel_x | filter kernel size x |
[in] | dim_kernel_y | filter kernel size y |
[in] | padding_x | padding size x |
[in] | padding_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out_x | output tensor dimension x |
[in] | dim_im_out_y | output tensor dimension y |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SUCCESS
References arm_nn_mat_mult_kernel_q7_q15(), arm_q7_to_q15_no_shift(), and NN_ROUND.
arm_status arm_convolve_HWC_q7_fast | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in is a multiple of 4 (because of the SIMD32 read and swap)
ch_im_out is a multiple of 2 (because of the 2x2 mat_mult kernel)
The im2col converts the Q7 tensor input into Q15 columns, which are stored in bufferA. Reordering happens during this im2col process via arm_q7_to_q15_reordered_no_shift: for every four elements, the second and third elements are swapped.
The computation kernel arm_nn_mat_mult_kernel_q7_q15_reordered performs the GEMM computation on the reordered columns.
To speed up the evaluation of the padding condition, the computation is split into 3x3 parts, i.e., {top, mid, bottom} x {left, mid, right}. This reduces the total number of boundary-condition checks and improves the data-copying performance. A sketch of the reordering is shown after the reference list below.
References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.
Referenced by main().
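The reordering described above can be illustrated with a plain-C reference. The library itself does this with SIMD intrinsics inside arm_q7_to_q15_reordered_no_shift(); the function below is only a readable model of the element order, not part of the API.

#include <stdint.h>

/* Expand q7 data to q15 while swapping the second and third element of
 * every group of four, so the column data matches the operand order
 * expected by the reordered GEMM kernel. */
static void q7_to_q15_reordered_ref(const int8_t *src, int16_t *dst, int block_size)
{
    for (int i = 0; i + 4 <= block_size; i += 4)
    {
        dst[i + 0] = src[i + 0];
        dst[i + 1] = src[i + 2];   /* swapped */
        dst[i + 2] = src[i + 1];   /* swapped */
        dst[i + 3] = src[i + 3];
    }
}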
arm_status arm_convolve_HWC_q7_fast_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in_x | input tensor dimension x |
[in] | dim_im_in_y | input tensor dimension y |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel_x | filter kernel size x |
[in] | dim_kernel_y | filter kernel size y |
[in] | padding_x | padding size x |
[in] | padding_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out_x | output tensor dimension x |
[in] | dim_im_out_y | output tensor dimension y |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. This function is the version with the full list of optimization tricks, but with some constraints: ch_im_in must be a multiple of 4 and ch_im_out must be a multiple of 2.
References arm_nn_mat_mult_kernel_q7_q15_reordered(), arm_q7_to_q15_reordered_no_shift(), and NN_ROUND.
arm_status arm_convolve_HWC_q7_RGB | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
Q7 version of convolution for RGB image.
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in equals 3
This kernel is written exclusively for convolutions where ch_im_in equals 3. This applies to the first layer of CNNs, which takes an RGB-format input image.
References arm_nn_mat_mult_kernel_q7_q15(), arm_nnword::half_words, NN_ROUND, and arm_nnword::word.
Referenced by main().
arm_status arm_convolve_s8 | ( | const q7_t * | input, |
const uint16_t | input_x, | ||
const uint16_t | input_y, | ||
const uint16_t | input_ch, | ||
const uint16_t | input_batches, | ||
const q7_t * | kernel, | ||
const uint16_t | output_ch, | ||
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y, | ||
const uint16_t | pad_x, | ||
const uint16_t | pad_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const int32_t * | bias, | ||
q7_t * | output, | ||
const int32_t * | output_shift, | ||
const int32_t * | output_mult, | ||
const int32_t | out_offset, | ||
const int32_t | input_offset, | ||
const int32_t | output_activation_min, | ||
const int32_t | output_activation_max, | ||
const uint16_t | output_x, | ||
const uint16_t | output_y, | ||
q15_t * | buffer_a | ||
) |
[in] | input | pointer to input tensor. Range: int8, format: [N,H,W,in_ch] |
[in] | input_x | input tensor width |
[in] | input_y | input tensor height |
[in] | input_ch | number of input tensor channels |
[in] | input_batches | number of input batches |
[in] | kernel | pointer to kernel weights. Range: int8, format: [out_ch, H, W, in_ch] |
[in] | output_ch | number of filters, i.e., output tensor channels |
[in] | kernel_x | filter/kernel width |
[in] | kernel_y | filter/kernel height |
[in] | pad_x | padding along width |
[in] | pad_y | padding along height |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to per output channel bias. Range: int32 |
[in,out] | output | pointer to output tensor. format: [H, W, out_ch] |
[in] | output_shift | pointer to per output channel requantization shift parameter. |
[in] | output_mult | pointer to per output channel requantization multiplier parameter. |
[in] | out_offset | output tensor offset. Range: int8 |
[in] | input_offset | input tensor offset. Range: int8 |
[in] | output_activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | output_activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | output_x | output tensor width |
[in] | output_y | output tensor height |
[in] | buffer_a | pointer to buffer space used for input optimization (partial im2col) and is necessary when both ARM_MATH_LOOPUNROLL and ARM_MATH_DSP are defined. Required space: (2 * input_ch * kernel_x * kernel_y) * sizeof(q15_t) bytes. Use arm_convolve_s8_get_buffer_size() to get the size. |
ARM_MATH_SUCCESS
References arm_nn_mat_mult_kernel_s8_s16(), arm_nn_read_q15x2_ia(), arm_nn_requantize(), arm_q7_to_q15_with_offset(), MAX, and MIN.
Referenced by arm_convolve_1x1_s8_fast().
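A minimal usage sketch under assumed layer dimensions and placeholder quantization parameters; the im2col scratch buffer is sized by arm_convolve_s8_get_buffer_size(), and the code guards against a zero return on targets where no scratch is needed.

#include <stdlib.h>
#include "arm_nnfunctions.h"

/* Illustrative 3x3 s8 convolution layer; all sizes are placeholders. */
#define IN_X  16
#define IN_Y  16
#define IN_CH 8
#define OUT_CH 16
#define KER_X 3
#define KER_Y 3
#define BATCH 1
#define OUT_X 16   /* stride 1 with pad 1 keeps the spatial size */
#define OUT_Y 16

static const q7_t    kernel[OUT_CH * KER_Y * KER_X * IN_CH];
static const int32_t bias[OUT_CH];
static const int32_t out_mult[OUT_CH];
static const int32_t out_shift[OUT_CH];
static q7_t          input[BATCH * IN_Y * IN_X * IN_CH];
static q7_t          output[OUT_Y * OUT_X * OUT_CH];

arm_status conv_s8_layer(void)
{
    const int32_t buf_size = arm_convolve_s8_get_buffer_size(IN_CH, KER_X, KER_Y);
    q15_t *buffer_a = (buf_size > 0) ? (q15_t *)malloc((size_t)buf_size) : NULL;

    arm_status status = arm_convolve_s8(
        input, IN_X, IN_Y, IN_CH, BATCH,
        kernel, OUT_CH, KER_X, KER_Y,
        1, 1,            /* pad_x, pad_y       */
        1, 1,            /* stride_x, stride_y */
        bias, output, out_shift, out_mult,
        0, 0,            /* out_offset, input_offset    */
        -128, 127,       /* output_activation_min/max   */
        OUT_X, OUT_Y,
        buffer_a);

    free(buffer_a);
    return status;
}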
int32_t arm_convolve_s8_get_buffer_size | ( | const uint16_t | input_ch, |
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y | ||
) |
[in] | input_ch | number of input tensor channels |
[in] | kernel_x | filter/kernel width |
[in] | kernel_y | filter/kernel height |
arm_status arm_depthwise_conv_s8 | ( | const q7_t * | input, |
const uint16_t | input_x, | ||
const uint16_t | input_y, | ||
const uint16_t | input_ch, | ||
const q7_t * | kernel, | ||
const uint16_t | output_ch, | ||
const uint16_t | ch_mult, | ||
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y, | ||
const uint16_t | pad_x, | ||
const uint16_t | pad_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const int32_t * | bias, | ||
q7_t * | output, | ||
const int32_t * | output_shift, | ||
const int32_t * | output_mult, | ||
const uint16_t | output_x, | ||
const uint16_t | output_y, | ||
const int32_t | output_offset, | ||
const int32_t | input_offset, | ||
const int32_t | output_activation_min, | ||
const int32_t | output_activation_max, | ||
const uint16_t | dilation_x, | ||
const uint16_t | dilation_y, | ||
q15_t * | buffer_a | ||
) |
[in] | input | pointer to input tensor. Range: int8, format: [H,W,in_ch] |
[in] | input_x | input tensor width |
[in] | input_y | input tensor height |
[in] | input_ch | number of input tensor channels |
[in] | kernel | pointer to kernel weights. Range: int8, format: [in_ch, H, W, out_ch] |
[in] | output_ch | Number of output channels. output_ch = ch_mult * input_ch |
[in] | ch_mult | channel multiplier. |
[in] | kernel_x | filter/kernel width |
[in] | kernel_y | filter/kernel height |
[in] | pad_x | padding along width |
[in] | pad_y | padding along height |
[in] | stride_x | convolution stride along width |
[in] | stride_y | convolution stride along height |
[in] | bias | pointer to per output channel bias. Range: int32 |
[in,out] | output | pointer to output tensor. Format: [H, W, out_ch] |
[in] | output_shift | pointer to per output channel requantization shift parameter. |
[in] | output_mult | pointer to per output channel requantization multiplier parameter. |
[in] | output_x | output tensor width |
[in] | output_y | output tensor height |
[in] | output_offset | offset to elements of output tensor |
[in] | input_offset | offset to elements of input tensor |
[in] | output_activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | output_activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | dilation_x | dilation along x. Not used. Dilation factor of 1 is used. |
[in] | dilation_y | dilation along y. Not used. Dilation factor of 1 is used. |
[in] | buffer_a | Not used. |
ARM_MATH_SUCCESS
References arm_nn_requantize(), MAX, and MIN.
Referenced by arm_depthwise_conv_s8_opt().
arm_status arm_depthwise_conv_s8_opt | ( | const q7_t * | input, |
const uint16_t | input_x, | ||
const uint16_t | input_y, | ||
const uint16_t | input_ch, | ||
const q7_t * | kernel, | ||
const uint16_t | output_ch, | ||
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y, | ||
const uint16_t | pad_x, | ||
const uint16_t | pad_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const int32_t * | bias, | ||
q7_t * | output, | ||
const int32_t * | output_shift, | ||
const int32_t * | output_mult, | ||
const uint16_t | output_x, | ||
const uint16_t | output_y, | ||
const int32_t | output_offset, | ||
const int32_t | input_offset, | ||
const int32_t | output_activation_min, | ||
const int32_t | output_activation_max, | ||
const uint16_t | dilation_x, | ||
const uint16_t | dilation_y, | ||
q15_t * | buffer_a | ||
) |
[in] | input | pointer to input tensor. Range: int8, format: [H,W,in_ch] |
[in] | input_x | input tensor width |
[in] | input_y | input tensor height |
[in] | input_ch | number of input tensor channels |
[in] | kernel | pointer to kernel weights. Range: int8, Format: [in_ch, H, W, out_ch] |
[in] | output_ch | Number of output channels. |
[in] | kernel_x | filter/kernel width |
[in] | kernel_y | filter/kernel height |
[in] | pad_x | padding along width |
[in] | pad_y | padding along height |
[in] | stride_x | convolution stride along width |
[in] | stride_y | convolution stride along height |
[in] | bias | pointer to per output channel bias. Range: int8 |
[in,out] | output | pointer to output tensor. Format: [H, W, out_ch] |
[in] | output_shift | pointer to per output channel requantization shift parameter. |
[in] | output_mult | pointer to per output channel requantization multiplier parameter. |
[in] | output_x | output tensor width |
[in] | output_y | output tensor height |
[in] | output_offset | offset to elements of output tensor |
[in] | input_offset | offset to elements of input tensor |
[in] | output_activation_min | Minimum value to clamp the output to. Range: int8 |
[in] | output_activation_max | Maximum value to clamp the output to. Range: int8 |
[in] | dilation_x | dilation along x. Not used. Dilation factor of 1 is used. |
[in] | dilation_y | dilation along y. Not used. Dilation factor of 1 is used. |
[in] | buffer_a | Buffer for partial im2col optimization. This is mandatory when ARM_MATH_LOOPUNROLL and ARM_MATH_DSP are defined. Required space: (2 * input_ch * kernel_x * kernel_y) * sizeof(q15_t) bytes Use arm_depthwise_conv_s8_opt_get_buffer_size() to get the size. |
ARM_MATH_SIZE_MISMATCH
- Unsupported dimension of tensors ARM_MATH_SUCCESS
- Successful operation. References arm_depthwise_conv_s8(), arm_nn_read_q15x2(), arm_nn_read_q7x4(), arm_nn_requantize(), arm_q7_to_q15_with_offset(), col_buffer, MAX, and MIN.
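A minimal usage sketch, assuming in_channel equal to out_channel as the constraint requires, illustrative dimensions, and placeholder quantization parameters; the scratch buffer is sized with arm_depthwise_conv_s8_opt_get_buffer_size().

#include <stdlib.h>
#include "arm_nnfunctions.h"

/* Illustrative depthwise layer with input_ch == output_ch. */
#define IN_X  16
#define IN_Y  16
#define CH    16
#define KER_X 3
#define KER_Y 3
#define OUT_X 16
#define OUT_Y 16

static const q7_t    kernel[KER_Y * KER_X * CH];
static const int32_t bias[CH];
static const int32_t out_mult[CH];
static const int32_t out_shift[CH];
static q7_t          input[IN_Y * IN_X * CH];
static q7_t          output[OUT_Y * OUT_X * CH];

arm_status depthwise_s8_layer(void)
{
    const int32_t buf_size =
        arm_depthwise_conv_s8_opt_get_buffer_size(CH, KER_X, KER_Y);
    q15_t *buffer_a = (buf_size > 0) ? (q15_t *)malloc((size_t)buf_size) : NULL;

    arm_status status = arm_depthwise_conv_s8_opt(
        input, IN_X, IN_Y, CH,
        kernel, CH,
        KER_X, KER_Y,
        1, 1,            /* pad_x, pad_y       */
        1, 1,            /* stride_x, stride_y */
        bias, output, out_shift, out_mult,
        OUT_X, OUT_Y,
        0, 0,            /* output_offset, input_offset */
        -128, 127,       /* output_activation_min/max   */
        1, 1,            /* dilation_x, dilation_y (not used) */
        buffer_a);

    free(buffer_a);
    return status;       /* ARM_MATH_SIZE_MISMATCH on unsupported dimensions */
}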
int32_t arm_depthwise_conv_s8_opt_get_buffer_size | ( | const uint16_t | input_ch, |
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y | ||
) |
[in] | input_ch | number of input tensor channels |
[in] | kernel_x | filter/kernel width |
[in] | kernel_y | filter/kernel height |
arm_status arm_depthwise_conv_u8_basic_ver1 | ( | const uint8_t * | input, |
const uint16_t | input_x, | ||
const uint16_t | input_y, | ||
const uint16_t | input_ch, | ||
const uint8_t * | kernel, | ||
const uint16_t | kernel_x, | ||
const uint16_t | kernel_y, | ||
const int16_t | ch_mult, | ||
const int16_t | pad_x, | ||
const int16_t | pad_y, | ||
const int16_t | stride_x, | ||
const int16_t | stride_y, | ||
const int16_t | dilation_x, | ||
const int16_t | dilation_y, | ||
const int32_t * | bias, | ||
const int32_t | input_offset, | ||
const int32_t | filter_offset, | ||
const int32_t | output_offset, | ||
uint8_t * | output, | ||
const uint16_t | output_x, | ||
const uint16_t | output_y, | ||
const int32_t | output_activation_min, | ||
const int32_t | output_activation_max, | ||
const int32_t | out_shift, | ||
const int32_t | out_mult | ||
) |
uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier and input channels. Unless specified otherwise, arguments are mandatory.
[in] | input | Pointer to input tensor |
[in] | input_x | Width of input tensor |
[in] | input_y | Height of input tensor |
[in] | input_ch | Channels in input tensor |
[in] | kernel | Pointer to kernel weights |
[in] | kernel_x | Width of kernel |
[in] | kernel_y | Height of kernel |
[in] | ch_mult | Channel multiplier |
[in] | pad_x | Padding sizes x |
[in] | pad_y | Padding sizes y |
[in] | stride_x | Convolution stride along the width |
[in] | stride_y | Convolution stride along the height |
[in] | dilation_x | Dilation along width. Not used and intended for future enhancement. |
[in] | dilation_y | Dilation along height. Not used and intended for future enhancement. |
[in] | bias | Pointer to optional bias values. If no bias is available, NULL is expected |
[in] | input_offset | Input tensor zero offset |
[in] | filter_offset | Kernel tensor zero offset |
[in] | output_offset | Output tensor zero offset |
[in,out] | output | Pointer to output tensor |
[in] | output_x | Width of output tensor |
[in] | output_y | Height of output tensor |
[in] | output_activation_min | Minimum value to clamp the output to. Range : {0, 255} |
[in] | output_activation_max | Maximum value to clamp the output to. Range : {0, 255} |
[in] | out_shift | Amount of right-shift for output |
[in] | out_mult | Output multiplier for requantization |
ARM_MATH_SIZE_MISMATCH
- Unsupported dimension of tensors ARM_MATH_SUCCESS
- Successful operation ARM_MATH_ARGUMENT_ERROR
- Implementation not available.
Input constraints: ch_mult is a multiple of 2 and kernel_x is a multiple of 2.
References arm_nn_divide_by_power_of_two(), arm_nn_sat_doubling_high_mult(), DILATION_X, DILATION_Y, LEFT_SHIFT, and RIGHT_SHIFT.
arm_status arm_depthwise_separable_conv_HWC_q7 | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel, | ||
const uint16_t | padding, | ||
const uint16_t | stride, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in | input tensor dimension |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel | filter kernel size |
[in] | padding | padding sizes |
[in] | stride | convolution stride |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out | output tensor dimension |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking.Buffer size:
bufferA size: 2*ch_im_in*dim_kernel*dim_kernel
bufferB size: 0
Input dimension constraints:
ch_im_in equals ch_im_out
Implementation: There are 3 nested loops here. Inner loop: calculate each output value with MAC instructions over an accumulator. Mid loop: loop over the different output channels. Outer loop: loop over the different output positions (x, y). A plain-C sketch of this loop structure is shown after the reference list below.
References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.
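The loop structure described above, written out as a plain-C reference for readability. This sketch omits the fixed-point rounding (NN_ROUND), the SIMD path, and the bufferA/bufferB handling, and it assumes ch_im_in == ch_im_out with an [H, W, C] weight layout; it is a model of the loop nest, not the library implementation.

#include <stdint.h>

static void depthwise_q7_ref(const int8_t *im_in, int dim_im_in, int ch_im_in,
                             const int8_t *wt, int dim_kernel,
                             int padding, int stride,
                             const int8_t *bias, int bias_shift, int out_shift,
                             int8_t *im_out, int dim_im_out)
{
    for (int oy = 0; oy < dim_im_out; oy++)              /* outer: output (x, y) */
    {
        for (int ox = 0; ox < dim_im_out; ox++)
        {
            for (int c = 0; c < ch_im_in; c++)           /* mid: output channel */
            {
                int32_t acc = ((int32_t)bias[c]) << bias_shift;
                for (int ky = 0; ky < dim_kernel; ky++)  /* inner: MAC loop */
                {
                    for (int kx = 0; kx < dim_kernel; kx++)
                    {
                        int iy = oy * stride - padding + ky;
                        int ix = ox * stride - padding + kx;
                        if (iy >= 0 && iy < dim_im_in && ix >= 0 && ix < dim_im_in)
                        {
                            acc += (int32_t)im_in[(iy * dim_im_in + ix) * ch_im_in + c] *
                                   wt[(ky * dim_kernel + kx) * ch_im_in + c];
                        }
                    }
                }
                acc >>= out_shift;                       /* requantize (no rounding here) */
                if (acc > 127)  acc = 127;               /* saturate to q7 */
                if (acc < -128) acc = -128;
                im_out[(oy * dim_im_out + ox) * ch_im_in + c] = (int8_t)acc;
            }
        }
    }
}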
arm_status arm_depthwise_separable_conv_HWC_q7_nonsquare | ( | const q7_t * | Im_in, |
const uint16_t | dim_im_in_x, | ||
const uint16_t | dim_im_in_y, | ||
const uint16_t | ch_im_in, | ||
const q7_t * | wt, | ||
const uint16_t | ch_im_out, | ||
const uint16_t | dim_kernel_x, | ||
const uint16_t | dim_kernel_y, | ||
const uint16_t | padding_x, | ||
const uint16_t | padding_y, | ||
const uint16_t | stride_x, | ||
const uint16_t | stride_y, | ||
const q7_t * | bias, | ||
const uint16_t | bias_shift, | ||
const uint16_t | out_shift, | ||
q7_t * | Im_out, | ||
const uint16_t | dim_im_out_x, | ||
const uint16_t | dim_im_out_y, | ||
q15_t * | bufferA, | ||
q7_t * | bufferB | ||
) |
[in] | Im_in | pointer to input tensor |
[in] | dim_im_in_x | input tensor dimension x |
[in] | dim_im_in_y | input tensor dimension y |
[in] | ch_im_in | number of input tensor channels |
[in] | wt | pointer to kernel weights |
[in] | ch_im_out | number of filters, i.e., output tensor channels |
[in] | dim_kernel_x | filter kernel size x |
[in] | dim_kernel_y | filter kernel size y |
[in] | padding_x | padding size x |
[in] | padding_y | padding size y |
[in] | stride_x | convolution stride x |
[in] | stride_y | convolution stride y |
[in] | bias | pointer to bias |
[in] | bias_shift | amount of left-shift for bias |
[in] | out_shift | amount of right-shift for output |
[in,out] | Im_out | pointer to output tensor |
[in] | dim_im_out_x | output tensor dimension x |
[in] | dim_im_out_y | output tensor dimension y |
[in,out] | bufferA | pointer to buffer space for input |
[in,out] | bufferB | pointer to buffer space for output |
ARM_MATH_SIZE_MISMATCH
or ARM_MATH_SUCCESS
based on the outcome of size checking. This function is the version with the full list of optimization tricks, but with one constraint: ch_im_in is equal to ch_im_out.
References arm_nnword::bytes, NN_ROUND, and arm_nnword::word.