Given a model, e.g.
from gensim.models.word2vec import Word2Vec
documents = [\"Human machine interface for lab abc computer applications\",
\"A survey of u
Have tried and felt that the most straightforward way is as follows:
My sample code is as follows:
line_no = 0 # line0 = header
numEntities=0
targetLines = []
with open(file_entVecs_txt,'r') as fp:
header = fp.readline() # header
while True:
line = fp.readline()
if line == '': #EOF
break
line_no += 1
isLatinFlag = True
for i_l, char in enumerate(line):
if not isLatin(char): # Care about entity that is Latin-only
isLatinFlag = False
break
if char==' ': # reached separator
ent = line[:i_l]
break
if not isLatinFlag:
continue
# Check for numbers in entity
if re.search('\d',ent):
continue
# Check for entities with subheadings '#' (e.g. 'ENTITY/Stereotactic_surgery#History')
if re.match('^ENTITY/.*#',ent):
continue
targetLines.append(line_no)
numEntities += 1
# Update header with new metadata
header_new = re.sub('^\d+',str(numEntities),header,count=1)
# Generate the file
txtWrite('',file_entVecs_SHORT_txt)
txtAppend(header_new,file_entVecs_SHORT_txt)
line_no = 0
ptr = 0
with open(file_entVecs_txt,'r') as fp:
while ptr < len(targetLines):
target_line_no = targetLines[ptr]
while (line_no != target_line_no):
fp.readline()
line_no+=1
line = fp.readline()
line_no+=1
ptr+=1
txtAppend(line,file_entVecs_SHORT_txt)
FYI. FAILED ATTEMPT I tried out @zsozso's method (with the np.array modifications suggested by @Taegyung), left it to run overnight for at least 12 hrs, it was still stuck at getting new words from the restricted set...). This is perhaps because I have a lot of entities... But my text-file method works within an hour.
FAILED CODE
# [FAILED] Stuck at Building new vocab...
def restrict_w2v(w2v, restricted_word_set):
new_vectors = []
new_vocab = {}
new_index2entity = []
new_vectors_norm = []
print('Building new vocab..')
for i in range(len(w2v.vocab)):
if (i%int(1e6)==0) and (i!=0):
print(f'working on {i}')
word = w2v.index2entity[i]
vec = np.array(w2v.vectors[i])
vocab = w2v.vocab[word]
vec_norm = w2v.vectors_norm[i]
if word in restricted_word_set:
vocab.index = len(new_index2entity)
new_index2entity.append(word)
new_vocab[word] = vocab
new_vectors.append(vec)
new_vectors_norm.append(vec_norm)
print('Assigning new vocab')
w2v.vocab = new_vocab
print('Assigning new vectors')
w2v.vectors = np.array(new_vectors)
print('Assigning new index2entity, index2word')
w2v.index2entity = new_index2entity
w2v.index2word = new_index2entity
print('Assigning new vectors_norm')
w2v.vectors_norm = np.array(new_vectors_norm)