How to count character occurrences using SIMD

后端 未结 3 819
耶瑟儿~
耶瑟儿~ 2020-12-21 13:04

I am given an array of lowercase characters (up to 1.5 GB) and a character c, and I want to count how many times c occurs in the array using AVX instructions.

         


        
3条回答
  •  遥遥无期
    2020-12-21 13:46

    Probably the fastest: memcount_avx2 and memcount_sse2

    /*
     * Count how many of the n bytes starting at s are equal to the low 8 bits
     * of c, using AVX2.
     *
     * s  start of the buffer (no alignment requirement; unaligned loads are used)
     * c  byte value to look for; only its low 8 bits are compared, so both
     *    0xE9 and (char)0xE9 select the same byte
     * n  number of bytes to scan
     *
     * Returns the number of matching bytes.
     *
     * The function carries its own target("avx2") attribute so it compiles
     * without -mavx2 on the command line (GCC/Clang). The caller must still make
     * sure the CPU supports AVX2 before calling it.
     */
    __attribute__((target("avx2")))
    size_t memcount_avx2(const void *s, int c, size_t n)
    {
      enum {
        VEC_BYTES   = 32,              /* bytes per __m256i */
        BLOCK_BYTES = 252 * VEC_BYTES  /* bytes handled per outer iteration */
      };
      const unsigned char *p = s;
      const unsigned char *const end = p + n;
      const unsigned char needle = (unsigned char)c;
      const __m256i cv = _mm256_set1_epi8((char)c);
      const __m256i zv = _mm256_setzero_si256();
      __m256i sum = zv;   /* four 64-bit partial totals */
      __m256i acc;

      /*
       * Main loop: whole blocks of BLOCK_BYTES. cmpeq yields 0xFF (-1) in every
       * matching byte lane, so subtracting it adds 1 to that lane's 8-bit
       * counter. Each of the four counters sees 63 vectors per block, so no
       * lane can exceed 63 before it is flushed into the 64-bit totals.
       */
      for (size_t blocks = n / BLOCK_BYTES; blocks != 0; blocks--) {
        const unsigned char *const stop = p + BLOCK_BYTES;
        __m256i acc0 = zv, acc1 = zv, acc2 = zv, acc3 = zv;

        for (; p != stop; p += 4 * VEC_BYTES) {
          acc0 = _mm256_sub_epi8(acc0, _mm256_cmpeq_epi8(cv,
                   _mm256_lddqu_si256((const __m256i *)p)));
          acc1 = _mm256_sub_epi8(acc1, _mm256_cmpeq_epi8(cv,
                   _mm256_lddqu_si256((const __m256i *)(p + VEC_BYTES))));
          acc2 = _mm256_sub_epi8(acc2, _mm256_cmpeq_epi8(cv,
                   _mm256_lddqu_si256((const __m256i *)(p + 2 * VEC_BYTES))));
          acc3 = _mm256_sub_epi8(acc3, _mm256_cmpeq_epi8(cv,
                   _mm256_lddqu_si256((const __m256i *)(p + 3 * VEC_BYTES))));
          __builtin_prefetch(p + 1024);   /* hint only; nothing is loaded here */
        }
        /* SAD against zero sums each group of 8 byte counters into a 64-bit lane. */
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acc0, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acc1, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acc2, zv));
        sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acc3, zv));
      }

      /*
       * Remaining whole vectors. Fewer than BLOCK_BYTES bytes are left, so this
       * runs at most 251 times and the 8-bit counters cannot overflow. The
       * bound is checked on the remaining length so no pointer beyond `end`
       * is ever formed.
       */
      acc = zv;
      for (; (size_t)(end - p) >= VEC_BYTES; p += VEC_BYTES) {
        acc = _mm256_sub_epi8(acc, _mm256_cmpeq_epi8(cv,
                _mm256_lddqu_si256((const __m256i *)p)));
      }
      sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acc, zv));

      size_t count = (size_t)_mm256_extract_epi64(sum, 0)
                   + (size_t)_mm256_extract_epi64(sum, 1)
                   + (size_t)_mm256_extract_epi64(sum, 2)
                   + (size_t)_mm256_extract_epi64(sum, 3);

      /*
       * Scalar tail (< 32 bytes). Compare as unsigned char so bytes >= 0x80 are
       * matched exactly like in the vector path, regardless of char signedness.
       */
      for (; p != end; p++) {
        count += (*p == needle);
      }
      return count;
    }
    

    Benchmark (Skylake i7-6700, 3.4 GHz, gcc 8.3):

    memcount_avx2 : 28 GB/s
    memcount_sse2 : 23 GB/s
    char_count_AVX2 : 23 GB/s (from post)

提交回复
热议问题