Disassembly analysis for a simple vectorized loop

This is my post on analyzing the AVX2 disassemblies generated for a simple 20 double vectorized multiplication and add loop.

The source code of the loop can be found here

https://github.com/yunmingzhang17/ligra/blob/master/apps/GD.C#L36

https://github.com/narayanan2004/GraphMat/blob/master/src/SGD.cpp#L122

Gradient Descent Assembly Code for GraphMat  (single threaded)

     │         while (i < A_DCSC->nzx) {                                                                     

      │           if (get_bitvector(A_DCSC->xindex[i], x.bitvector)) {                                        

      │             int end = (i == A_DCSC->nzx – 1 ? A_DCSC->nnz : A_DCSC->starty[i + 1]);                   

 0.00 │       sub    %r13d,%r12d                                                                              

      │             T  p = x.getValue(A_DCSC->xindex[i]);                                                     

      │             int j = A_DCSC->starty[i];                                                                

      │             //comment out the prefetching                                                             

      │             //_mm_prefetch((char*)(x.value + A_DCSC->xindex[i+4]), _MM_HINT_T0);                      

      │             for (; j < end; j++) {                                                                    

      │               gp->process_message(p, A_DCSC->value[j], Vertexproperty[A_DCSC->yindex[j]], res[0]);    

 0.01 │1b2:   mov    0x28(%r8),%rdi                                                                           

 0.06 │       mov    0x30(%r8),%rsi                                                                           

 0.82 │       mov    (%rdi,%r13,4),%edi                                                                       

 0.39 │       movslq %edi,%r10                                                                                

 0.01 │       imul   $0xa8,%r10,%r11                                                                          

      │       void process_message(const LatentVector<K>& message, const int edge_val,                        

      │                             const LatentVector<K>& vertexprop, LatentVector<K>& res) const {          ◆

      │         //res = message * edge_val;                                                                   

      │         double estimate = 0;                                                                          

      │         for (int i = 0; i < K; i++) {                                                                 

      │           estimate += message.lv[i]*vertexprop.lv[i];                                                 

( load  4*4 = 16 doubles, supposingly destination vertex latent vector (vertexprop.lv), most likely in fast cache. )

Note for myself, it is 4 doubles, because the offset 0x2a0, 0x2c0 is different by 32 bytes in binary. What this instruction is loading the content at (%rsp + 0x2a0)                                                             

 0.80 │       vmovup 0x2a0(%rsp),%ymm7         

 0.84 │       vmovup 0x2c0(%rsp),%ymm6                                                                        

 0.52 │       vmovup 0x2e0(%rsp),%ymm4                                                                        

 0.19 │       vmovup 0x300(%rsp),%ymm1                

(load  4*4 = 16 doubles, supposingly remote vertex latent vector, mostly likely NOT in fast cache because the random pattern can not be predicted)                                                      

11.48 │       vmovup (%r11,%r9,1),%xmm8                                                                       

 4.08 │       vmovup 0x20(%r11,%r9,1),%xmm10                                                                  

 3.73 │       vmovup 0x40(%r11,%r9,1),%xmm12                                                                  

 2.64 │       vmovup 0x60(%r11,%r9,1),%xmm14       

                                                          

(one last load for the last 4 doubles in the destination latent vector (vertexprop.lv), we know it is the last 4 bytes because the offset is 0x320, 32 bytes from 0x300, the offset of the last latent vector )

 0.24 │       vmovup 0x320(%rsp),%ymm0      (this move is fast because the cache line is already fetched by the first 4 vmovup instructions)                                                              

(getting ready to do the vector multiplication instructions. I am not sure what vinser do? I can’t find documentation on that )

 1.59 │       vinser $0x1,0x10(%r11,%r9,1),%ymm8,%ymm9                                                        

 1.15 │       vinser $0x1,0x30(%r11,%r9,1),%ymm10,%ymm11                                                      

 1.48 │       vinser $0x1,0x50(%r11,%r9,1),%ymm12,%ymm13                                                      

 2.02 │       vinser $0x1,0x70(%r11,%r9,1),%ymm14,%ymm15

(actually doing the vector multiplcation, we needed 5 to multiply together 20 doubles)                                                     

 1.14 │       vmulpd %ymm9,%ymm7,%ymm5                                                                        

 0.87 │       vmulpd %ymm11,%ymm6,%ymm2                                                                       

 0.95 │       vmulpd %ymm13,%ymm4,%ymm3                                                                       

 1.49 │       vmulpd %ymm15,%ymm1,%ymm10                                                                      

 0.79 │       vmovup 0x80(%r11,%r9,1),%xmm8      (this move is fast because the cache line is already fetched by the first 4 vmovup instructions)

     │       //void process_message(const ConstLatentVectorPtr<K>& message, const int edge_val,              

      │       void process_message(const LatentVector<K>& message, const int edge_val,                        

      │                             const LatentVector<K>& vertexprop, LatentVector<K>& res) const {          

      │         //res = message * edge_val;                                                                   

      │         double estimate = 0;    

(the add instructions in the end, there are 4 adds to sum up 20 doubles)                                                                      

 0.95 │       vaddpd %ymm2,%ymm5,%ymm2                                                                        

 1.64 │       vaddpd %ymm10,%ymm3,%ymm3                                                                       

 2.73 │       vaddpd %ymm3,%ymm2,%ymm5                                                                        

      │         for (int i = 0; i < K; i++) {                                                                 

      │           estimate += message.lv[i]*vertexprop.lv[i];                                                 

 1.09 │       vinser $0x1,0x90(%r11,%r9,1),%ymm8,%ymm9                                                        

 0.67 │       vmulpd %ymm9,%ymm0,%ymm11                                                                       

      │                                                                                                       

      │       //void process_message(const ConstLatentVectorPtr<K>& message, const int edge_val,              

      │       void process_message(const LatentVector<K>& message, const int edge_val,                        

      │                             const LatentVector<K>& vertexprop, LatentVector<K>& res) const {          

      │         //res = message * edge_val;                                                                   

      │         double estimate = 0;                                                                          

 2.94 │       vaddpd %ymm11,%ymm5,%ymm12                                                                      

      │         for (int i = 0; i < K; i++) {                                                                 

      │           estimate += message.lv[i]*vertexprop.lv[i];                                                 

      │         }                                                                                             

      │         double error = edge_val – estimate;                                                           

 0.05 │       vxorpd %xmm3,%xmm3,%xmm3                                                                        

 0.43 │       vcvtsi (%rsi,%r13,4),%xmm3,%xmm3                                                                

      │         }                                                                                             

      │     }                                           

Gradient Descent Assembly Code for Ligra  WITH -xHost (single threaded)

     │                                                                                                       

      │         double * __restrict __attribute ((align_value(32) ))  cur_error = &error[current_offset];     

 0.04 │       mov    error,%r9                                                                                

      │         double * __restrict __attribute ((align_value(32) ))  cur_latent =  &latent_curr[current_offse

      │         double * __restrict __attribute ((align_value(32) )) ngh_latent = &latent_curr[ngh_offset];   

      │                                                                                                       

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];  

( load  4*4 = 16 doubles, supposingly destination vertex latent vector (vertexprop.lv), most likely in fast cache. )

                                                

 0.08 │       vmovup (%r8,%rax,8),%xmm3                                                                       

 0.10 │       vmovup 0x20(%r8,%rax,8),%xmm7                                                                   

 0.97 │       vmovup 0x40(%r8,%rax,8),%xmm11                                                                  

 0.23 │       vmovup 0x60(%r8,%rax,8),%xmm15                                                                  

      │         double estimate = 0;                                                                          

      │         int current_offset = K*d;                                                                     

      │         int ngh_offset = K*s;                                                                         

      │                                                                                                       

      │         double * __restrict __attribute ((align_value(32) ))  cur_latent =  &latent_curr[current_offse

      │         double * __restrict __attribute ((align_value(32) )) ngh_latent = &latent_curr[ngh_offset];   

 0.05 │       lea    (%r8,%rcx,8),%rdi                                                                        

      │                                                                                                       

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];         

(load  4*4 = 16 doubles, supposingly remote vertex latent vector, mostly likely NOT in fast cache because the random pattern can not be predicted)

                                         

(This first few unpredictable load is unusally expensive, in fact it is much more expensive than the graphMat’s version)

23.27 │       vmovup (%rdi),%xmm4                                                                             

 5.71 │       vmovup 0x20(%rdi),%xmm8                                                                         

 9.00 │       vmovup 0x40(%rdi),%xmm12                                                                        

      │     #ifdef DEBUG                                                                                      

      │         cout << “rating: ” << edgeLen << endl;                                                        

      │         cout << “estimate: ” << estimate << endl;                                                     

      │     #endif                                                                                            

      │                                                                                                       

      │         double * __restrict __attribute ((align_value(32) ))  cur_error = &error[current_offset];     

 0.03 │       lea    (%r9,%rax,8),%rsi                                                                        

      │         double * __restrict __attribute ((align_value(32) ))  cur_latent =  &latent_curr[current_offse

      │         double * __restrict __attribute ((align_value(32) )) ngh_latent = &latent_curr[ngh_offset];

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];                                                   

 0.49 │       vinser $0x1,0x10(%r8,%rax,8),%ymm3,%ymm5                                                        

 9.19 │       vmovup 0x60(%rdi),%xmm3      

(doing one multiplication early on before finished loading the remote cache line)                                                                   

 0.41 │       vinser $0x1,0x10(%rdi),%ymm4,%ymm6                                                              

 1.19 │       vmulpd %ymm6,%ymm5,%ymm2                                                                        

 0.39 │       vmovup 0x80(%r8,%rax,8),%xmm6                                                                   

 0.02 │       vinser $0x1,0x50(%r8,%rax,8),%ymm11,%ymm13                                                      

 0.80 │       vinser $0x1,0x30(%r8,%rax,8),%ymm7,%ymm9                                                        

 0.02 │       vinser $0x1,0x70(%r8,%rax,8),%ymm15,%ymm4                                                       

 0.82 │       vinser $0x1,0x30(%rdi),%ymm8,%ymm10                                                             

 0.03 │       vinser $0x1,0x50(%rdi),%ymm12,%ymm14                                                            

 1.11 │       vinser $0x1,0x70(%rdi),%ymm3,%ymm5    

(The next three multiplications)                                                      

 0.15 │       vmulpd %ymm10,%ymm9,%ymm1                                                                       ◆

 0.33 │       vmulpd %ymm14,%ymm13,%ymm0                                                                      

 1.79 │       vmulpd %ymm5,%ymm4,%ymm10                                                                       

 3.55 │       vmovup 0x80(%rdi),%xmm7                                                                         

      │                                                                                                       

      │     #ifdef DEBUG                                                                                      

      │         cout << “edge src: ” << s << ” dst: ” << d << ” rating: ” << edgeLen << endl;                 

      │     #endif                                                                                            

      │                                                                                                       

      │         double estimate = 0;                                                                          

 0.30 │       vaddpd %ymm1,%ymm2,%ymm1                                                                        

 1.23 │       vaddpd %ymm10,%ymm0,%ymm0                                                                       

 1.92 │       vaddpd %ymm0,%ymm1,%ymm2                                                                        

      │         double * __restrict __attribute ((align_value(32) ))  cur_latent =  &latent_curr[current_offse

      │         double * __restrict __attribute ((align_value(32) )) ngh_latent = &latent_curr[ngh_offset];   

      │                                                                                                       

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];                                                   

 0.29 │       vinser $0x1,0x90(%rdi),%ymm7,%ymm9                                                              

 0.71 │       vinser $0x1,0x90(%r8,%rax,8),%ymm6,%ymm8                                                        

      │         cout << “estimate: ” << estimate << endl;                                                     

      │     #endif                                                                                            

      │                                                                                                       

      │         double * __restrict __attribute ((align_value(32) ))  cur_error = &error[current_offset];  

     │         for (int i = 0; i < K; i++){                                                                  

 0.00 │       xor    %al,%al                                                                                  

      │         double * __restrict __attribute ((align_value(32) ))  cur_latent =  &latent_curr[current_offse

      │         double * __restrict __attribute ((align_value(32) )) ngh_latent = &latent_curr[ngh_offset];   

      │                                                                                                       

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];                                                   

 0.41 │       vmulpd %ymm9,%ymm8,%ymm11                                                                       

      │                                                                                                       

      │     #ifdef DEBUG                                                                                      

      │         cout << “edge src: ” << s << ” dst: ” << d << ” rating: ” << edgeLen << endl;                 

      │     #endif                                                                                            

      │                                                                                                       

      │         double estimate = 0;                                                                          

 1.45 │       vaddpd %ymm11,%ymm2,%ymm12                                                                      ◆

      │                                                                                                       

      │         for(int i = 0; i < K; i++){                                                                   

      │           //estimate += latent_curr[current_offset + i]*latent_curr[ngh_offset + i];                  

      │           estimate +=  cur_latent[i]*ngh_latent[i];                                                   

      │         }                                                                                             

      │         double err = edgeLen – estimate;                                                              

 0.13 │       vxorpd %xmm2,%xmm2,%xmm2                                                                        

 0.01 │       vcvtsi %edx,%xmm2,%xmm2                                                                         

      │         cout << “estimate: ” << estimate << endl;                                                     

      │     #endif                                                                       

Advertisements
This entry was posted in High Performance Computing and tagged . Bookmark the permalink.

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s