Eigen の matmul はそのまま(`__xla_cpu_runtime_EigenMatMulF32` の呼び出しとして残っている)。
足し算の部分は `__xla_cpu_runtime_ParallelForkJoin` 経由で `parallel_add` という関数を並列実行しているようだ。
; Entry point for the compiled cluster. As visible in the body, it:
;   1. loads params[0] (as a raw i64 pointer value), params[1] and params[2]
;      (both as float*),
;   2. calls __xla_cpu_runtime_EigenMatMulF32 with temps[5], params[1],
;      params[2] and the constants 1024, 1024, 1024, 0, 0,
;   3. packs { temps[5], params[0] } into a two-entry stack array and passes it
;      to __xla_cpu_runtime_ParallelForkJoin together with @parallel_add,
;   4. stores the temps[0] pointer through %retval.
;
; Function Attrs: nounwind
define void @cluster_0__XlaCompiledKernel_true__XlaNumConstantArgs_0__XlaNumResourceArgs_0_.v8(i8* nocapture align 8 dereferenceable(8) %retval, i8* noalias %run_options, i8** noalias nocapture readonly %params, i8** noalias %temps, i64* noalias %prof_counters) local_unnamed_addr #1 {
entry:
  ; Stack array holding the two argument addresses handed to @parallel_add.
  %parallel_add_parameter_addresses1 = alloca [2 x i8*], align 8
  %parallel_add_parameter_addresses1.sub = getelementptr inbounds [2 x i8*], [2 x i8*]* %parallel_add_parameter_addresses1, i64 0, i64 0

  ; params[2] -> float*
  %0 = getelementptr inbounds i8*, i8** %params, i64 2
  %1 = bitcast i8** %0 to float**
  %arg2.untyped23 = load float*, float** %1, align 8, !invariant.load !7, !dereferenceable !0, !align !1

  ; params[1] -> float*
  %2 = getelementptr inbounds i8*, i8** %params, i64 1
  %3 = bitcast i8** %2 to float**
  %arg1.untyped45 = load float*, float** %3, align 8, !invariant.load !7, !dereferenceable !0, !align !1

  ; params[0], loaded as an i64 so the pointer value can be forwarded unchanged
  ; into slot 1 of the parameter-address array below.
  %4 = bitcast i8** %params to i64*
  %arg0.untyped6 = load i64, i64* %4, align 8, !invariant.load !7

  ; temps[5] -> float*. It is the first float* argument of the matmul call and
  ; slot 0 of the parallel_add parameter array.
  ; NOTE(review): presumably the matmul output buffer, going by the XLA
  ; runtime signature (run_options, out, lhs, rhs, m, n, k, transpose_lhs,
  ; transpose_rhs) - confirm against runtime_matmul.h.
  %5 = getelementptr inbounds i8*, i8** %temps, i64 5
  %6 = load i8*, i8** %5, align 8, !invariant.load !7, !dereferenceable !8, !align !1
  %7 = bitcast i8* %6 to float*
  tail call void @__xla_cpu_runtime_EigenMatMulF32(i8* %run_options, float* %7, float* %arg1.untyped45, float* %arg2.untyped23, i64 1024, i64 1024, i64 1024, i32 0, i32 0)

  ; temps[0]: passed as the first argument to ParallelForkJoin and later stored
  ; through %retval.
  %8 = load i8*, i8** %temps, align 8, !invariant.load !7, !dereferenceable !0, !align !1

  ; parameter_addresses = { temps[5], params[0] }
  store i8* %6, i8** %parallel_add_parameter_addresses1.sub, align 8
  %9 = getelementptr inbounds [2 x i8*], [2 x i8*]* %parallel_add_parameter_addresses1, i64 0, i64 1
  %10 = bitcast i8** %9 to i64*
  store i64 %arg0.untyped6, i64* %10, align 8

  ; Run @parallel_add through the fork/join runtime helper. This is a plain
  ; `call`, not a `tail call`, because one argument points into this frame's
  ; alloca.
  ; NOTE(review): `i32 3` and `i32 1` look like num_partitions and
  ; num_partitioned_dims, and the [6 x i64] global would then hold
  ; 3 x 1 x (start, limit) - confirm against runtime_fork_join.h.
  call void @__xla_cpu_runtime_ParallelForkJoin(i8* %8, i8* %run_options, i8** nonnull %parallel_add_parameter_addresses1.sub, i8** %temps, i64* %prof_counters, i32 3, i64* getelementptr inbounds ([6 x i64], [6 x i64]* @parallel_add_parallel_dimension_partitions, i64 0, i64 0), i32 1, i8* bitcast (void (i8*, i8*, i8**, i8**, i64*, i64*)* @parallel_add to i8*))

  ; *retval = temps[0]
  %11 = bitcast i8* %retval to i8**
  store i8* %8, i8** %11, align 8, !alias.scope !9, !noalias !2
  ret void
}