| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194 |
- // single: clang -O2 -march=native gemm.c
- // multi: clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
- #define _GNU_SOURCE
- // https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
- #include <stdint.h>
- #include <time.h>
- #include <sched.h>
- #include <stdio.h>
- #include <assert.h>
- #include <math.h>
- #include <string.h>
- #include <immintrin.h>
- #include <pthread.h>
- #include <unistd.h>
- #include <stdatomic.h>
- //#define DEBUG
- #ifdef DEBUG
- #define N 8
- #endif
- #ifndef N
- // NOTE: if you change this you have to rerun gemm.py
- #define N 512
- #endif
- #ifndef NTHREADS
- #define NTHREADS 1
- #endif
- // aligned?
- float A[N*N] __attribute__ ((aligned (64)));
- float B[N*N] __attribute__ ((aligned (64)));
- float C[N*N] __attribute__ ((aligned (64)));
- float val[N*N] __attribute__ ((aligned (64)));
- __m256 *Am = (__m256*)A;
- __m256 *Bm = (__m256*)B;
- __m256 *Cm = (__m256*)C;
- uint64_t nanos() {
- struct timespec start;
- clock_gettime(CLOCK_MONOTONIC_RAW, &start);
- return (uint64_t)start.tv_sec*1000000000 + (uint64_t)start.tv_nsec;
- }
- float Bf[N*N] __attribute__ ((aligned (64)));
- __m256 *Bfm = (__m256*)Bf;
- #define BLOCK 8
- #define BLOCK_Y 4
- #define BLOCK_X 2
- void matmul(int sy, int ey) {
- // 136.77 GFLOPS on single core numpy
- // 4.9 GHz is max boost for 5950X
- // 32 FLOPS/cycle (16 FMAs, aka 2x 8 single wide / 32 byte FMAs)
- // theoretical max is 156.8 GFLOPS, we see 150
- // multicore theo max = 2508.8 GFLOPS, we see 1501.434299
- // Bf = (y/8, k, 8)
- for (int y = sy; y < ey; y+=BLOCK_Y) {
- for (int x = 0; x < N; x+=BLOCK*BLOCK_X) {
- __m256 acc[BLOCK_Y][BLOCK_X] = {};
- for (int k = 0; k < N; k++) {
- for (int iy = 0; iy < BLOCK_Y; iy++) {
- __m256 ta = _mm256_broadcast_ss(&A[(y+iy)*N + k]);
- for (int ix = 0; ix < BLOCK_X; ix++) {
- acc[iy][ix] = _mm256_fmadd_ps(ta, Bfm[((x+ix*BLOCK)*N + k*8)/8], acc[iy][ix]);
- }
- }
- }
- for (int iy = 0; iy < BLOCK_Y; iy++) {
- for (int ix = 0; ix < BLOCK_X; ix++) {
- Cm[((y+iy)*N + x + ix * BLOCK)/8] = acc[iy][ix];
- }
- }
- }
- }
- }
- pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
- atomic_int nready = 0;
- atomic_int ndone = 0;
- void *matmul_thread(void *n) {
- int k = (int)(int64_t)n;
- int sy = (N/NTHREADS) * k;
- int ey = (N/NTHREADS) * (k+1);
- cpu_set_t set;
- CPU_ZERO(&set);
- CPU_SET(k,&set);
- pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
- nready++;
- // gotta have main lock once to signal start
- pthread_mutex_lock(&lock);
- pthread_mutex_unlock(&lock);
- matmul(sy, ey);
- // we done
- ndone++;
- return NULL;
- }
- int main() {
- printf("hello with %d threads\n", NTHREADS);
- #ifdef DEBUG
- for (int i = 0; i < N*N; i++) A[i] = i;
- for (int i = 0; i < N*N; i++) B[i] = i;
- #else
- FILE *f = fopen("/tmp/matmul", "rb");
- if (f == NULL) {
- printf("please pregenerate python /tmp/matmul file\n");
- return -1;
- }
- fread(A, 1, sizeof(float)*N*N, f);
- fread(B, 1, sizeof(float)*N*N, f);
- fread(val, 1, sizeof(float)*N*N, f);
- fclose(f);
- #endif
- // preswizzle
- for (int y = 0; y < N; y+=8) {
- for (int x = 0; x < N; x++) {
- for (int iy = 0; iy < 8; iy++) {
- Bf[y*N + x*8 + iy] = B[(y+iy)*N + x];
- }
- }
- }
- for (int i = 0; i < 10; i++) {
- memset(C, 0, N*N*sizeof(float));
- #if NTHREADS != 1
- nready = 0;
- ndone = 0;
- pthread_mutex_lock(&lock);
- pthread_t threads[NTHREADS];
- for (int j = 0; j < NTHREADS; j++) {
- pthread_create(&threads[j], NULL, matmul_thread, (void *)(uint64_t)j);
- }
- while (nready != NTHREADS) usleep(1);
- #endif
- uint64_t start = nanos();
- #if NTHREADS == 1
- matmul(0, N);
- #else
- // unlocking mutex starts threads
- pthread_mutex_unlock(&lock);
- while (ndone != NTHREADS) usleep(1);
- #endif
- uint64_t end = nanos();
- #if NTHREADS != 1
- for (int j = 0; j < NTHREADS; j++) {
- pthread_join(threads[j], NULL);
- }
- #endif
- double gflop = (2.0*N*N*N)*1e-9;
- double s = (end-start)*1e-9;
- printf("%f GFLOP/S -- %.2f ms\n", gflop/s, s*1e3);
- // hack around throttling
- //if (i%4 == 0) sleep(1);
- }
- #ifdef DEBUG
- for (int i = 0; i < N*N; i++) {
- if (i%N == 0 && i != 0) printf("\n");
- printf("%f ", C[i]);
- }
- printf("\n");
- #else
- for (int k = 0; k < N*N; k++) {
- if (fabsf(C[k] - val[k]) > 1e-3) {
- printf("MISMATCH AT %d, %f != %f\n", k, C[k], val[k]);
- return -1;
- }
- }
- printf("match\n");
- #endif
- return 0;
- }
|