字符串hash（进制hash）洛谷P3370

题目链接：https://www.luogu.org/problemnew/show/P3370

参考了洛谷的博客：https://www.luogu.org/blog/pks-LOVING/zi-fu-chuan-xue-xi-bi-ji-ha-xi-hash-yu-zi-dian-shu-trie

　　　　　　　　　 https://www.luogu.org/blog/yhzq/solution-p3370

对字符串进行进制哈希，就是取一个数字base（一般是质数）当做固定的进制，把字符串看成这个进制下的数字，然后将它（base进制数）转换成十进制数，这中间一般都会进行取模，防止数据溢出，最终得到的这个值我们就称为这个字符串的哈希值。如果两个字符串的hash值相同，我们一般就把这两个字符串看成是相同的字符串。我们的任务就是尽量使不同的字符串哈希之后得到的哈希值不同；如果两个不同的字符串的哈希值相同，我们就称它为哈希冲突。当然，我们无法保证两个不同的字符串的哈希值一定不同，只能尽量去减小哈希冲突的概率。

为了增加我们的准确率，我们可以对一个字符串进行两次（或者多次）不同的hash，只有所有的哈希值都相同的时候，我们才将两个字符串看成是相等的字符串，虽然这样会增加时间和空间的开销，但是可以明显的提高我们的准确率。

对于mod，我们可以将哈希值定义为unsigned long long类型的，这样，在溢出的时候就可以做到自然溢出（自动对2^64取模），这样可以减少程序运行所花费的时间，因为取模是需要花费一定的时间的。当然我们也可以设置一个mod，这个mod一般是一个质数，我取的是一个比1e18大的数。

看代码：

代码一：（自然溢出） 108ms

#include<iostream>
#include<cstring>
#include<algorithm>
#include<queue>
#include<map>
#include<stack>
#include<cmath>
#include<vector>
#include<set>
#include<cstdio>
#include<string>
#include<deque> 
using namespace std;
// Shorthand for the signed 64-bit integer type.
typedef long long LL;
#define eps 1e-8
#define INF 0x3f3f3f3f
// Capacity used for the record array a[] and the input buffer s[].
#define maxn 10005
typedef unsigned long long ull;// unsigned arithmetic wraps around, i.e. results are implicitly reduced modulo 2^64
const int base=131;// radix of the polynomial hash; an arbitrarily chosen prime
// One record per input string: its hash plus its length.
struct node{
    ull value;// hash value of the string
    int len;// length of the string; used as a second key when deciding whether two strings are equal
}a[maxn];
// n is the number of strings, read in main.
int n,m,k,t; 
// Buffer holding the string currently being read.
char s[maxn];
// Polynomial hash of the NUL-terminated string s in radix `base`.
// The accumulator starts at 1 and is never reduced explicitly: ull
// arithmetic wraps, so the result is taken modulo 2^64 for free.
ull getHash(char *s){
    ull h=1;
    for(const char *p=s;*p;++p){
        h=h*base+static_cast<ull>(*p);// wraps on overflow by design
    }
    return h;
}
// Strict ordering for sorting: compare hashes first, and fall back to
// the string length only when the hashes are equal.
bool operator <(node s1,node s2){
    return (s1.value==s2.value) ? (s1.len<s2.len)
                                : (s1.value<s2.value);
}
int main()
{
    scanf("%d",&n);
    for(int i=0;i<n;i++){
        scanf("%s",&s);
        a[i].len=strlen(s);
        a[i].value=getHash(s);
    }
    sort(a,a+n);//排序 
    int ans=1;
    for(int i=1;i<n;i++){
        if(a[i].value!=a[i-1].value||a[i].len!=a[i-1].len)//只要当前字符串的value或len中的一个和前面的字符串不同，我们就将它们看成是不同的字符串 
        ans++;
    } 
    printf("%d\n",ans);
    return 0;
}

代码二：（多重哈希）421ms

#include<iostream>
#include<cstring>
#include<algorithm>
#include<queue>
#include<map>
#include<stack>
#include<cmath>
#include<vector>
#include<set>
#include<cstdio>
#include<string>
#include<deque> 
using namespace std;
// Shorthands for the signed / unsigned 64-bit integer types.
typedef long long LL;
typedef unsigned long long ull;
#define eps 1e-8
#define INF 0x3f3f3f3f
// Capacity used for the record array a[] and the input buffer s[].
#define maxn 10005
// Modulus applied in getHash.
// NOTE(review): the original comment described these constants as arbitrarily
// chosen primes, but this value ends in 5 and is therefore divisible by 5.
const ull mod=1792348491639061335;
// Radix of the first hash (type==1).
const ull basea=131;
// Radix of the second hash (any other type).
const ull baseb=251;
// Offset added after every modular reduction in getHash.
const int prime=104743;
// n is the number of strings, read in main.
int n,m,k,t;
// One record per input string, holding two independently computed hashes.
struct node{
    ull ahash,bhash;// the two different hash values of the same string
}a[maxn]; 
// Buffer holding the string currently being read.
char s[maxn];
// Hash of the NUL-terminated string s.
// type==1 selects radix basea; any other value selects radix baseb.
ull getHash(char *s,int type){
    const ull radix=(type==1)?basea:baseb;
    ull h=1;
    for(const char *p=s;*p;++p){
        // The product can exceed 64 bits and then wraps before the
        // explicit reduction; `prime` is added after each reduction.
        h=(h*radix+*p)%mod+prime;
    }
    return h;
}
// Strict ordering for sorting: the first hash decides, and the second
// hash breaks ties.
bool operator <(node s1,node s2){
    return (s1.ahash==s2.ahash) ? (s1.bhash<s2.bhash)
                                : (s1.ahash<s2.ahash);
}
int main()
{
    scanf("%d",&n);
    for(int i=0;i<n;i++){
        scanf("%s",s);
        a[i].ahash=getHash(s,1);//得到第一种hash值 
        a[i].bhash=getHash(s,2);//得到第二种hash值 
    }
    sort(a,a+n);
    int ans=1;
    for(int i=1;i<n;i++){
        if(a[i].ahash!=a[i-1].ahash||a[i].bhash!=a[i-1].bhash)
        ans++;
    }
    printf("%d\n",ans);
    return 0;
}

思路二：题目要我们求出所有给出的字符串中有多少个不同的字符串，我们可以尝试将这些字符串按照某种方式分组，然后加入对应的分组，查找的时候可以通过计算来找到字符串所属分组，在分组里面查找是否已经有了这个字符串，这样就缩小了范围。当然，可能会有一些不同的字符串被分到相同的分组，所以我们在这里用链表来存一个分组里面的字符串，每一个链表就代表一个分组。

在这里面我们寻找分组的方式就是将字符串看成是255进制的数字，然后再将它转化成为10进制的数字，同时对1000007取模（注意：1000007=29×34483，其实并不是素数，不过由于每个分组内还会用strcmp逐一比较字符串，这并不影响结果的正确性），这样就可以得到一个数字（index）。我们先在编号为index的链表中查找是否已经有这个字符串，有就不加，没有就插入链表并且计数。

代码：

#include<iostream>
#include<cstring>
#include<algorithm>
#include<queue>
#include<map>
#include<stack>
#include<cmath>
#include<vector>
#include<set>
#include<cstdio>
#include<string>
#include<deque> 
using namespace std;
// Shorthand for the signed 64-bit integer type.
typedef long long LL;
#define eps 1e-8
#define INF 0x3f3f3f3f
// Modulus that maps a string to a bucket index in main.
// NOTE(review): 1000007 = 29 * 34483, so this modulus is not prime.
#define mod 1000007
// Size of the input buffer str[] and of the bucket-head table head[].
const int maxn=1e6+100;
/*struct point{
    int u,w;
};
bool operator <(const point &s1,const point &s2)
{
    if(s1.w!=s2.w)
    return s1.w>s2.w;
    else
    return s1.u>s2.u;
}*/
// Reads the next decimal integer from stdin.
// Every non-digit character before the number is skipped; if a '-' is
// among the skipped characters the result is negated. The character
// that ends the number is consumed as well.
// Returns 0 when the input ends before any digit is found.
inline int read(){
    int sign=1,value=0;
    int ch=getchar();// int, not char, so that EOF stays distinguishable
    while(ch!=EOF&&(ch<'0'||ch>'9')){// stops at end of input instead of spinning forever
        if(ch=='-')sign=-1;
        ch=getchar();
    }
    while(ch>='0'&&ch<='9'){
        value=value*10+(ch-'0');
        ch=getchar();
    }
    return sign*value;
}
// Buffer holding the string currently being read.
char str[maxn];
// n: number of strings; cnt: number of edge[] entries in use;
// ans: number of distinct strings found so far.
int n,m,k,t,cnt,ans;
// head[u] is the index in edge[] of the first entry of chain u, or -1 when the chain is empty.
int head[maxn];
// One stored string; entries of the same chain are linked through `next`.
struct node{
    int next;// index of the next entry in the same chain, -1 at the end
    char w[1505];// copy of the stored string
}edge[10005];
void init(){
    memset(head,-1,sizeof(head));
    cnt=ans=0;
}
void add(char *str,int u){//在编号为u的链表里面增加str 
    strcpy(edge[++cnt].w,str);
    edge[cnt].next=head[u];
    head[u]=cnt;
}
// Returns true when chain u already contains a string equal to str.
bool find(int u,char *str){
    int cur=head[u];
    while(cur!=-1){
        if(!strcmp(edge[cur].w,str)){
            return true;
        }
        cur=edge[cur].next;
    }
    return false;
}
// Reads n strings and prints how many distinct ones there are, using a
// chained hash table: each string is mapped to a chain, and is stored
// (and counted) only if that chain does not already contain it.
int main()
{
    // A missing count is treated as "no strings".
    if(scanf("%d",&n)!=1) n=0;
    init();
    for(int i=0;i<n;i++){
        if(scanf("%s",str)!=1) break;// input ended early

        // Read the string as a base-255 number reduced modulo `mod`.
        // The accumulator itself is multiplied by the radix, so the
        // position of each character matters and anagrams no longer all
        // share a chain. Going through unsigned char keeps the index
        // non-negative for bytes above 127.
        // index < mod, so index*255 + 255 stays well inside int range.
        int index=0;
        for(int j=0;str[j];j++){
            index=(index*255+static_cast<unsigned char>(str[j]))%mod;
        }

        if(!find(index,str)){
            add(str,index);
            ans++;
        }
    }
    printf("%d\n",ans);
    return 0;
}

来源：https://www.cnblogs.com/6262369sss/p/10719041.html

标签

进制

哈希

hash

字符串hash（进制hash） 洛谷P3370

字符串hash（进制hash）洛谷P3370