-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfloat32mul.c
136 lines (120 loc) · 5 KB
/
float32mul.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include <math.h>
#include <arm_neon.h>
#include <time.h>
#define BLOCK_SIZE 8
/*
 * Reference (scalar) matrix product C = A * B, timed with clock().
 *
 * The indexing used below addresses all three matrices column-major:
 *   A is n x k : element (row, col) lives at A[n*col + row]
 *   B is k x m : element (row, col) lives at B[k*col + row]
 *   C is n x m : element (row, col) lives at C[n*col + row]
 *
 * Every element of C is overwritten. The elapsed processor time reported by
 * clock() is printed to stdout as one line.
 */
void matrix_multiply_c(float32_t *A, float32_t *B, float32_t *C, uint32_t n, uint32_t m, uint32_t k) {
    // Work in size_t so loop counters match the unsigned bounds and the
    // index products are not truncated to 32 bits before being used.
    const size_t rows = n;
    const size_t cols = m;
    const size_t inner = k;

    clock_t t = clock();
    for (size_t i_idx = 0; i_idx < rows; i_idx++) {
        for (size_t j_idx = 0; j_idx < cols; j_idx++) {
            // Accumulate in a local so C is written once per element instead
            // of being read and written on every step of the dot product.
            float32_t acc = 0;
            for (size_t k_idx = 0; k_idx < inner; k_idx++) {
                acc += A[rows*k_idx + i_idx] * B[inner*j_idx + k_idx];
            }
            C[rows*j_idx + i_idx] = acc;
        }
    }
    t = clock() - t;

    double time_taken = ((double)t)/CLOCKS_PER_SEC; // elapsed processor time in seconds
    // Trailing newline keeps this report from running into later output.
    printf("The program taken for normal mul is %f seconds to execute\n", time_taken);
}
void matrix_multiply_neon(float32_t *A, float32_t *B, float32_t *C, uint32_t n, uint32_t m, uint32_t k) {
int A_idx;
int B_idx;
int C_idx;
float32x4_t A0;
float32x4_t A1;
float32x4_t A2;
float32x4_t A3;
float32x4_t B0;
float32x4_t B1;
float32x4_t B2;
float32x4_t B3;
float32x4_t C0;
float32x4_t C1;
float32x4_t C2;
float32x4_t C3;
clock_t t;
t = clock();
for (int i_idx=0; i_idx<n; i_idx+=4) {
for (int j_idx=0; j_idx<m; j_idx+=4) {
C0 = vmovq_n_f32(0);
C1 = vmovq_n_f32(0);
C2 = vmovq_n_f32(0);
C3 = vmovq_n_f32(0);
for (int k_idx=0; k_idx<k; k_idx+=4) {
A_idx = i_idx + n*k_idx;
B_idx = k*j_idx + k_idx;
A0 = vld1q_f32(A+A_idx);
A1 = vld1q_f32(A+A_idx+n);
A2 = vld1q_f32(A+A_idx+2*n);
A3 = vld1q_f32(A+A_idx+3*n);
B0 = vld1q_f32(B+B_idx);
C0 = vfmaq_laneq_f32(C0, A0, B0, 0);
C0 = vfmaq_laneq_f32(C0, A1, B0, 1);
C0 = vfmaq_laneq_f32(C0, A2, B0, 2);
C0 = vfmaq_laneq_f32(C0, A3, B0, 3);
B1 = vld1q_f32(B+B_idx+k);
C1 = vfmaq_laneq_f32(C1, A0, B1, 0);
C1 = vfmaq_laneq_f32(C1, A1, B1, 1);
C1 = vfmaq_laneq_f32(C1, A2, B1, 2);
C1 = vfmaq_laneq_f32(C1, A3, B1, 3);
B2 = vld1q_f32(B+B_idx+2*k);
C2 = vfmaq_laneq_f32(C2, A0, B2, 0);
C2 = vfmaq_laneq_f32(C2, A1, B2, 1);
C2 = vfmaq_laneq_f32(C2, A2, B2, 2);
C2 = vfmaq_laneq_f32(C2, A3, B2, 3);
B3 = vld1q_f32(B+B_idx+3*k);
C3 = vfmaq_laneq_f32(C3, A0, B3, 0);
C3 = vfmaq_laneq_f32(C3, A1, B3, 1);
C3 = vfmaq_laneq_f32(C3, A2, B3, 2);
C3 = vfmaq_laneq_f32(C3, A3, B3, 3);
}
C_idx = n*j_idx + i_idx;
vst1q_f32(C+C_idx, C0);
vst1q_f32(C+C_idx+n, C1);
vst1q_f32(C+C_idx+2*n, C2);
vst1q_f32(C+C_idx+3*n, C3);
}
}
t = clock() - t;
double time_taken = ((double)t)/CLOCKS_PER_SEC; // calculate the elapsed time
printf("The program taken for Neon mul is %f seconds to execute", time_taken);
}
/*
 * Write a matrix to stdout, one matrix row per output line, followed by a
 * blank line. Element (row, col) is read from M[col*rows + row], i.e. the
 * buffer is walked column-major. Each value is printed with "%f " (note the
 * trailing space after every value, including the last one on a line).
 */
void print_matrix(float32_t *M, uint32_t cols, uint32_t rows) {
    uint32_t row = 0;
    while (row < rows) {
        uint32_t col = 0;
        while (col < cols) {
            float32_t value = M[col*rows + row];
            printf("%f ", value);
            col++;
        }
        printf("\n");
        row++;
    }
    printf("\n");
}
/*
 * Fill the first numvals entries of M, in ascending order, with
 * rand() / RAND_MAX computed in single precision. The values therefore lie
 * in [0, 1]. The generator is not seeded here.
 */
void matrix_init_rand(float32_t *M, uint32_t numvals) {
    const float denom = (float)(RAND_MAX);
    float32_t *out = M;
    uint32_t remaining = numvals;

    while (remaining > 0) {
        *out = (float)rand()/denom;
        out++;
        remaining--;
    }
}
/*
 * Demo driver: fills A (n x k) and B (k x m) with random values, prints them,
 * then computes A * B with the scalar routine and with the NEON routine and
 * prints both results so they can be compared by eye.
 */
int main(void) {
    // Dimensions as compile-time constants so the buffers below are sized
    // from them rather than from a separately maintained literal.
    enum {
        N_ROWS  = 2*BLOCK_SIZE, // rows in A and in the result
        M_COLS  = 2*BLOCK_SIZE, // cols in B and in the result
        K_INNER = 2*BLOCK_SIZE  // cols in A and rows in B
    };

    uint32_t n = N_ROWS;
    uint32_t m = M_COLS;
    uint32_t k = K_INNER;

    static float32_t A[N_ROWS*K_INNER];
    static float32_t B[K_INNER*M_COLS];
    static float32_t D[N_ROWS*M_COLS]; // NEON result
    static float32_t E[N_ROWS*M_COLS]; // scalar result

    matrix_init_rand(A, n*k);
    matrix_init_rand(B, k*m);

    // print_matrix takes (matrix, cols, rows).
    print_matrix(A, k, n);
    print_matrix(B, m, k);

    matrix_multiply_c(A, B, E, n, m, k);
    printf("normal mul\n");
    // The product has n rows and m columns, so cols = m and rows = n.
    print_matrix(E, m, n);

    matrix_multiply_neon(A, B, D, n, m, k);
    printf("Neon\n");
    print_matrix(D, m, n);

    return 0;
}