<< 15/22 >>
First Last

matmul + add (parallel_add)

ループ1回あたり 8×8 = 64 要素ずつ処理しているようだ(%index は 64 ずつ増える)
内訳: SIMD 幅の 8 (<8 x float>) × アンロールの 8 (ベクトル 4 本 × 2 回展開)

 define internal void @parallel_add(i8* nocapture align 16 dereferenceable(4194304) %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readonly %params, i8** noalias nocapture readnone %temps, i64* noalias nocapture readonly %dynamic_loop_bounds, i64* noalias nocapture readnone %prof_counters) #0 {
 entry:
   %0 = bitcast i8** %params to [1024 x [1024 x float]]**
   %name.untyped2 = load [1024 x [1024 x float]]*, [1024 x [1024 x float]]** %0, align 8, !dereferenceable !0, !align !1
   %1 = getelementptr inbounds i8*, i8** %params, i64 1
   %2 = bitcast i8** %1 to [1024 x [1024 x float]]**
   %name.1.untyped3 = load [1024 x [1024 x float]]*, [1024 x [1024 x float]]** %2, align 8, !dereferenceable !0, !align !1
   %add.clone = bitcast i8* %retval to [1024 x [1024 x float]]*
   %3 = load i64, i64* %dynamic_loop_bounds, align 8
   %dynamic_loop_bound_1 = getelementptr i64, i64* %dynamic_loop_bounds, i64 1
   %4 = load i64, i64* %dynamic_loop_bound_1, align 8
   %5 = icmp ult i64 %3, %4
   br i1 %5, label %vector.ph.preheader, label %add.clone.loop_exit.dim.0
 
 vector.ph.preheader:                              ; preds = %entry
   br label %vector.ph
 
 vector.ph:                                        ; preds = %vector.ph.preheader, %add.clone.loop_exit.dim.1
   %add.clone.invar_address.dim.0.05 = phi i64 [ %invar.inc, %add.clone.loop_exit.dim.1 ], [ %3, %vector.ph.preheader ]
   br label %vector.body
 
 vector.body:                                      ; preds = %vector.body, %vector.ph
   %index = phi i64 [ 0, %vector.ph ], [ %index.next.1, %vector.body ]
   %6 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %name.untyped2, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index
   %7 = bitcast float* %6 to <8 x float>*
   %wide.load = load <8 x float>, <8 x float>* %7, align 16, !noalias !2
   %8 = getelementptr float, float* %6, i64 8
   %9 = bitcast float* %8 to <8 x float>*
   %wide.load10 = load <8 x float>, <8 x float>* %9, align 16, !noalias !2
   %10 = getelementptr float, float* %6, i64 16
   %11 = bitcast float* %10 to <8 x float>*
   %wide.load11 = load <8 x float>, <8 x float>* %11, align 16, !noalias !2
   %12 = getelementptr float, float* %6, i64 24
   %13 = bitcast float* %12 to <8 x float>*
   %wide.load12 = load <8 x float>, <8 x float>* %13, align 16, !noalias !2
   %14 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %name.1.untyped3, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index
   %15 = bitcast float* %14 to <8 x float>*
   %wide.load13 = load <8 x float>, <8 x float>* %15, align 16, !noalias !2
   %16 = getelementptr float, float* %14, i64 8
   %17 = bitcast float* %16 to <8 x float>*
   %wide.load14 = load <8 x float>, <8 x float>* %17, align 16, !noalias !2
   %18 = getelementptr float, float* %14, i64 16
   %19 = bitcast float* %18 to <8 x float>*
   %wide.load15 = load <8 x float>, <8 x float>* %19, align 16, !noalias !2
   %20 = getelementptr float, float* %14, i64 24
   %21 = bitcast float* %20 to <8 x float>*
   %wide.load16 = load <8 x float>, <8 x float>* %21, align 16, !noalias !2
   %22 = fadd fast <8 x float> %wide.load13, %wide.load
   %23 = fadd fast <8 x float> %wide.load14, %wide.load10
   %24 = fadd fast <8 x float> %wide.load15, %wide.load11
   %25 = fadd fast <8 x float> %wide.load16, %wide.load12
   %26 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %add.clone, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index
   %27 = bitcast float* %26 to <8 x float>*
   store <8 x float> %22, <8 x float>* %27, align 16, !alias.scope !2
   %28 = getelementptr float, float* %26, i64 8
   %29 = bitcast float* %28 to <8 x float>*
   store <8 x float> %23, <8 x float>* %29, align 16, !alias.scope !2
   %30 = getelementptr float, float* %26, i64 16
   %31 = bitcast float* %30 to <8 x float>*
   store <8 x float> %24, <8 x float>* %31, align 16, !alias.scope !2
   %32 = getelementptr float, float* %26, i64 24
   %33 = bitcast float* %32 to <8 x float>*
   store <8 x float> %25, <8 x float>* %33, align 16, !alias.scope !2
   %index.next = or i64 %index, 32
   %34 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %name.untyped2, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index.next
   %35 = bitcast float* %34 to <8 x float>*
   %wide.load.1 = load <8 x float>, <8 x float>* %35, align 16, !noalias !2
   %36 = getelementptr float, float* %34, i64 8
   %37 = bitcast float* %36 to <8 x float>*
   %wide.load10.1 = load <8 x float>, <8 x float>* %37, align 16, !noalias !2
   %38 = getelementptr float, float* %34, i64 16
   %39 = bitcast float* %38 to <8 x float>*
   %wide.load11.1 = load <8 x float>, <8 x float>* %39, align 16, !noalias !2
   %40 = getelementptr float, float* %34, i64 24
   %41 = bitcast float* %40 to <8 x float>*
   %wide.load12.1 = load <8 x float>, <8 x float>* %41, align 16, !noalias !2
   %42 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %name.1.untyped3, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index.next
   %43 = bitcast float* %42 to <8 x float>*
   %wide.load13.1 = load <8 x float>, <8 x float>* %43, align 16, !noalias !2
   %44 = getelementptr float, float* %42, i64 8
   %45 = bitcast float* %44 to <8 x float>*
   %wide.load14.1 = load <8 x float>, <8 x float>* %45, align 16, !noalias !2
   %46 = getelementptr float, float* %42, i64 16
   %47 = bitcast float* %46 to <8 x float>*
   %wide.load15.1 = load <8 x float>, <8 x float>* %47, align 16, !noalias !2
   %48 = getelementptr float, float* %42, i64 24
   %49 = bitcast float* %48 to <8 x float>*
   %wide.load16.1 = load <8 x float>, <8 x float>* %49, align 16, !noalias !2
   %50 = fadd fast <8 x float> %wide.load13.1, %wide.load.1
   %51 = fadd fast <8 x float> %wide.load14.1, %wide.load10.1
   %52 = fadd fast <8 x float> %wide.load15.1, %wide.load11.1
   %53 = fadd fast <8 x float> %wide.load16.1, %wide.load12.1
   %54 = getelementptr inbounds [1024 x [1024 x float]], [1024 x [1024 x float]]* %add.clone, i64 0, i64 %add.clone.invar_address.dim.0.05, i64 %index.next
   %55 = bitcast float* %54 to <8 x float>*
   store <8 x float> %50, <8 x float>* %55, align 16, !alias.scope !2
   %56 = getelementptr float, float* %54, i64 8
   %57 = bitcast float* %56 to <8 x float>*
   store <8 x float> %51, <8 x float>* %57, align 16, !alias.scope !2
   %58 = getelementptr float, float* %54, i64 16
   %59 = bitcast float* %58 to <8 x float>*
   store <8 x float> %52, <8 x float>* %59, align 16, !alias.scope !2
   %60 = getelementptr float, float* %54, i64 24
   %61 = bitcast float* %60 to <8 x float>*
   store <8 x float> %53, <8 x float>* %61, align 16, !alias.scope !2
   %index.next.1 = add nuw nsw i64 %index, 64
   %62 = icmp eq i64 %index.next.1, 1024
   br i1 %62, label %add.clone.loop_exit.dim.1, label %vector.body, !llvm.loop !5