How to determine probability of words?

有些话、适合烂在心里 提交于 2019-12-04 11:35:57

Presuming you are ignoring TOPIC lines, use a defaultdict to group the values and then do the calculation at the end:

from collections import defaultdict
from itertools import groupby, imap

# Maps each word to the list of values read for it from doc1, in file order.
d = defaultdict(list)
with open("doc1") as f, open("doc2") as f2:
    # Materialise the doc2 floats as a real list: they are zipped again for
    # every word in the loop below, and a lazy map object (what map() returns
    # on Python 3) would be exhausted after the first word, silently giving 0
    # for all the others.
    values = [float(tok) for tok in f2.read().split()]
    for line in f:
        # Skip blank lines and TOPIC header lines; everything else is
        # expected to be a "<word> <value>" pair.
        if line.strip() and not line.startswith("TOPIC"):
            name, val = line.split()
            d[name].append(float(val))

# Pair each word's values positionally with the doc2 values and sum the
# products.
for k, v in d.items():
    print("Prob for {} is {}".format(k, sum(i * j for i, j in zip(v, values))))

Another way would be to do the calculations as you go, moving on to the next value from doc2 each time you hit a new section (i.e. a line starting with TOPIC), so that every word line is multiplied by the value that belongs to its section:

from collections import defaultdict
from itertools import imap

# Running total of value * section-weight for each word.
d = defaultdict(float)

with open("doc1") as f, open("doc2") as f2:
    # Lazy iterator over the floats in doc2; one is consumed per TOPIC line.
    values = imap(float, f2.read().split())
    for line in f:
        # A TOPIC line starts a new section: advance to the doc2 value that
        # belongs to it. (Using that float as an index into `values`, as the
        # earlier version did, raises TypeError -- the float itself is the
        # multiplier.)
        if line.startswith("TOPIC"):
            weight = next(values)
            continue
        # Ignore empty lines.
        if line.strip():
            # Split into word and value, and add the value scaled by the
            # current section's weight. `weight` is only bound once a TOPIC
            # line has been seen, so doc1 must start with one.
            name, val = line.split()
            d[name] += float(val) * weight

for k, v in d.items():
    print("Prob for {} is {}".format(k, v))

Using your doc1 content, and with 0 0.566667 0 0.0333333 0 inside doc2, both approaches output the following:

Prob for web is 0.085187930859
Prob for say is 0.0255701266375
Prob for online is 0.0076985327511
Prob for site is 0.0293277438137
Prob for Internet is 0.00870667394471

You could also use itertools groupby:

from collections import defaultdict
from itertools import groupby, imap

# Running total of value * section-weight for each word.
d = defaultdict(float)

with open("doc1") as f, open("doc2") as f2:
    # Lazy iterator over the floats in doc2; one is consumed per section.
    values = imap(float, f2.read().split())
    # Keying on "is this line blank?" makes groupby yield alternating runs of
    # blank and non-blank lines, i.e. it splits doc1 on the empty lines.
    for is_blank, section in groupby(f, key=lambda x: not x.strip()):
        if is_blank:
            continue
        # Discard the first line of the section (presumably the TOPIC
        # header), then take the matching float from doc2. The float gets
        # its own name rather than rebinding `f`, the open file handle.
        next(section)
        weight = next(values)
        # The remaining lines of the section are "<word> <value>" pairs.
        for s in section:
            name, val = s.split()
            d[name] += float(val) * weight

# items() rather than iteritems(), so the loop matches the other snippets and
# does not depend on a Python 2-only dict method.
for k, v in d.items():
    print("Prob for {} is {}".format(k, v))

For Python 3, every itertools imap should be replaced with the built-in map (and imap removed from the itertools import, since it no longer exists there); in Python 3, map also returns an iterator.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!