<< 14/22 >>
First Last

matmul + add

matmul の部分は Eigen の matmul(__xla_cpu_runtime_EigenMatMulF32)を呼び出す形のまま
足し算の部分は __xla_cpu_runtime_ParallelForkJoin を使って parallel_add という関数を並列実行しているみたい

 ; Entry function for the "matmul + add" cluster. It makes two runtime calls:
 ;   1. @__xla_cpu_runtime_EigenMatMulF32 on the two float buffers taken from
 ;      %params[1] and %params[2];
 ;   2. @__xla_cpu_runtime_ParallelForkJoin, handing it a pointer to
 ;      @parallel_add together with a two-entry parameter-address array.
 ; Finally it stores the pointer loaded from %temps[0] into %retval.
 ;
 ; Parameters (as used below):
 ;   %retval        - written once at the end with the pointer from %temps[0].
 ;   %run_options   - opaque pointer forwarded unchanged to both runtime calls.
 ;   %params        - array of pointers; entries 0, 1 and 2 are read.
 ;   %temps         - array of pointers; entries 0 and 5 are read, and the
 ;                    array itself is forwarded to ParallelForkJoin.
 ;   %prof_counters - forwarded unchanged to ParallelForkJoin.
 ;
 ; Function Attrs: nounwind
 define void @cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.v8(i8* nocapture align 8 dereferenceable(8) %retval, i8* noalias %run_options, i8** noalias nocapture readonly %params, i8** noalias %temps, i64* noalias %prof_counters) local_unnamed_addr #1 {
 entry:
   ; Stack array of two i8* slots; it is filled in after the matmul and passed
   ; to ParallelForkJoin as the parameter-address list for @parallel_add.
   %parallel_add_parameter_addresses1 = alloca [2 x i8*], align 8
   ; Pointer to slot 0 of that array (the array decayed to i8**).
   %parallel_add_parameter_addresses1.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %parallel_add_parameter_addresses1, i64 0, i64 0
   ; arg2 = params[2], loaded directly as a float*.
   %0 = getelementptr inbounds i8*, i8** %params, i64 2
   %1 = bitcast i8** %0 to float**
   %arg2.untyped23 = load float*, float** %1, align 8, !invariant.load !7, !dereferenceable !0, !align !1
   ; arg1 = params[1], loaded directly as a float*.
   %2 = getelementptr inbounds i8*, i8** %params, i64 1
   %3 = bitcast i8** %2 to float**
   %arg1.untyped45 = load float*, float** %3, align 8, !invariant.load !7, !dereferenceable !0, !align !1
   ; arg0 = params[0]. The pointer value is loaded as a plain i64 because it is
   ; only copied into the parameter-address array below, never dereferenced here.
   %4 = bitcast i8** %params to i64*
   %arg0.untyped6 = load i64, i64* %4, align 8, !invariant.load !7
   ; %6 = temps[5]; %7 is the same buffer viewed as float*.
   %5 = getelementptr inbounds i8*, i8** %temps, i64 5
   %6 = load i8*, i8** %5, align 8, !invariant.load !7, !dereferenceable !8, !align !1
   %7 = bitcast i8* %6 to float*
   ; Matmul through the Eigen runtime helper, with temps[5], arg1 and arg2 as the
   ; three float buffers.
   ; NOTE(review): presumably temps[5] is the output buffer, the three 1024s are
   ; the m/n/k dimensions and the two i32 0 values are transpose flags - confirm
   ; against the XLA CPU runtime declaration of this helper.
   tail call void @__xla_cpu_runtime_EigenMatMulF32(i8* %run_options, float* %7, float* %arg1.untyped45, float* %arg2.untyped23, i64 1024, i64 1024, i64 1024, i32 0, i32 0)
   ; %8 = temps[0]; passed as the first argument of ParallelForkJoin and later
   ; stored into %retval.
   %8 = load i8*, i8** %temps, align 8, !invariant.load !7, !dereferenceable !0, !align !1
   ; Parameter-address slot 0 <- temps[5] (the buffer the matmul call received).
   store i8* %6, i8** %parallel_add_parameter_addresses1.sub, align 8
   ; Parameter-address slot 1 <- arg0 (written through an i64* view of the slot).
   %9 = getelementptr inbounds [2 x i8*], [2 x i8*]* %parallel_add_parameter_addresses1, i64 0, i64 1
   %10 = bitcast i8** %9 to i64*
   store i64 %arg0.untyped6, i64* %10, align 8
   ; Hand @parallel_add to the fork-join runtime helper along with the parameter
   ; addresses, %temps, %prof_counters and the 6-entry i64 table
   ; @parallel_add_parallel_dimension_partitions.
   ; NOTE(review): i32 3 is presumably the partition count and i32 1 the number
   ; of partitioned dimensions (the table would then hold 3 start/limit pairs) -
   ; confirm against the XLA CPU runtime declaration of this helper.
   call void @__xla_cpu_runtime_ParallelForkJoin(i8* %8, i8* %run_options, i8** nonnull %parallel_add_parameter_addresses1.sub, i8** %temps, i64* %prof_counters, i32 3, i64* getelementptr inbounds ([6 x i64], [6 x i64]* @parallel_add_parallel_dimension_partitions, i64 0, i64 0), i32 1, i8* bitcast (void (i8*, i8*, i8**, i8**, i64*, i64*)* @parallel_add to i8*))
   ; Publish the temps[0] pointer through %retval (viewed as i8**).
   %11 = bitcast i8* %retval to i8**
   store i8* %8, i8** %11, align 8, !alias.scope !9, !noalias !2
   ret void
 }