大数据去重——位图

匿名 (未验证) 提交于 2019-12-03 00:25:02

100亿整型数据去重?


32位整型最多只有 2^32(约42.9亿)个不同取值,所以100亿个整型数据中一定存在重复。用位图表示时,每个取值对应1个bit,共需 2^32 bit = 2^29 字节 = 512MB 内存。

下面是去重算法:

/*
 * Bitmap de-duplication of 32-bit unsigned integers.
 *
 * Every possible value owns one bit in a 512 MiB bitmap: value v lives in
 * byte (v / 8), bit (v % 8).  Setting the same bit twice is a no-op, so
 * duplicates collapse automatically; walking the bitmap afterwards yields
 * each distinct value once, in ascending order.
 *
 * Define BITMAP_NO_MAIN to compile only the helpers (e.g. into a test
 * harness) without the command-line driver.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Largest value the bitmap can represent (2^32 - 1). */
#define MAX (0xffffffff)

/*
 * Return 1 if the bit for `num` is set in `buf`, otherwise 0.
 * The bytes are read as unsigned char so bit 7 is never sign-extended.
 */
static unsigned int bitAt(const char *buf, unsigned int num)
{
    const unsigned char *bytes = (const unsigned char *)buf;

    return (bytes[num >> 3] >> (num & 7u)) & 1u;
}

/*
 * Mark `num` as present by setting its bit in `buf`.
 * `buf` must be at least (num / 8) + 1 bytes long.
 */
void setBuf(char *buf, unsigned int num)
{
    unsigned char *bytes = (unsigned char *)buf;

    bytes[num >> 3] |= (unsigned char)(1u << (num & 7u));
}

/*
 * Return 1 if `num` has been marked in `buf`, otherwise 0.
 *
 * The parameter stays `int` for compatibility with existing callers, but it
 * is reinterpreted as unsigned before indexing: a negative `num` (which is
 * what a value >= 2^31 looks like after conversion to int) would otherwise
 * produce a negative byte offset and read outside the buffer.
 */
unsigned int getBuf(char *buf, int num)
{
    return bitAt(buf, (unsigned int)num);
}

/*
 * Parse `text` as a plain decimal number in [0, MAX].
 * Returns 0 and stores the value in `*out` on success; returns -1 (leaving
 * `*out` untouched) for empty input, signs, leading whitespace, trailing
 * characters, or values that do not fit in 32 bits.
 */
static int parseValue(const char *text, unsigned int *out)
{
    char *end = NULL;
    unsigned long value;

    /* strtoul accepts "-5" and silently negates it, so insist on a digit. */
    if (*text < '0' || *text > '9') {
        return -1;
    }

    errno = 0;
    value = strtoul(text, &end, 10);
    if (*end != '\0' || errno == ERANGE || value > MAX) {
        return -1;
    }

    *out = (unsigned int)value;
    return 0;
}

#ifndef BITMAP_NO_MAIN
/*
 * Command-line driver: marks every numeric argument in the bitmap, then
 * prints each distinct value once, in ascending order, together with the
 * byte index and byte contents that hold its bit.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        printf("usage:./a {0-9}*\n");
        return 0;
    }

    /* One bit per possible 32-bit value: 2^32 / 8 = 512 MiB, zero-filled. */
    char *buf = calloc((size_t)(MAX >> 3) + 1, sizeof *buf);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate the 512MB bitmap\n");
        return EXIT_FAILURE;
    }

    unsigned int max = 0;
    for (int arg = 1; arg < argc; arg++) {
        unsigned int num;

        if (parseValue(argv[arg], &num) != 0) {
            fprintf(stderr, "skipping invalid value: %s\n", argv[arg]);
            continue;
        }
        if (num > max) {
            max = num;
        }
        setBuf(buf, num);
    }

    /*
     * Scan [0, max].  The exit test sits at the bottom because a
     * `index <= max` loop condition can never become false when max is
     * 0xffffffff (the counter would wrap to 0 instead).
     */
    unsigned int index = 0;
    for (;;) {
        if (bitAt(buf, index) == 1) {
            printf("id:%-10u flag:0x%-16x value:%-10u state:%-2d\n",
                   index >> 3,
                   (unsigned int)(unsigned char)buf[index >> 3],
                   index,
                   bitAt(buf, index));
        }
        /* Guard against max == 0 so the percentage is never 0/0. */
        printf("process[%u]:%.2f%%\r", index,
               max != 0 ? (double)index / max * 100 : 100.0);

        if (index == max) {
            break;
        }
        index++;
    }
    printf("\n");

    free(buf);
    return 0;
}
#endif /* BITMAP_NO_MAIN */

测试结果:

[root@centos code]# ./a.out 100 100 45 4 53 4 23 23 23 24 35 454 6 4 6543  3242 2 324 54 6 23 23 2 32 4 354 654 65 6 1000 id:0          flag:0x54               value:2          state:1  id:0          flag:0x54               value:4          state:1  id:0          flag:0x54               value:6          state:1  id:2          flag:0xffffff80         value:23         state:1  id:3          flag:0x1                value:24         state:1  id:4          flag:0x9                value:32         state:1  id:4          flag:0x9                value:35         state:1  id:5          flag:0x20               value:45         state:1  id:6          flag:0x60               value:53         state:1  id:6          flag:0x60               value:54         state:1  id:8          flag:0x2                value:65         state:1  id:12         flag:0x10               value:100        state:1  id:40         flag:0x10               value:324        state:1  id:44         flag:0x4                value:354        state:1  id:56         flag:0x40               value:454        state:1  id:81         flag:0x40               value:654        state:1  id:125        flag:0x1                value:1000       state:1  id:405        flag:0x4                value:3242       state:1  id:817        flag:0xffffff80         value:6543       state:1  process[6543]:100.00%


易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!