I have been playing with the NLTK toolkit. I run into this problem a lot and have searched for a solution online, but nowhere did I find a satisfying answer. So I am putting my query here.
There is a bug in @alvas's answer: a fencepost error. Make sure to run that elif check outside of the loop as well, so that you don't leave off an NE that occurs at the end of the sentence. So:
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            # NE subtree: add its tokens to the chunk being built.
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            # Non-NE token: the current chunk is complete, so flush it.
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue

    # Flush a chunk that runs up to the end of the sentence (the fencepost fix).
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk

txt = "Barack Obama is a great person and so is Michelle Obama."
print(get_continuous_chunks(txt))
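As an aside, the same merge-adjacent-chunks idea can also be written with itertools.groupby. This is only a sketch of an alternative, not @alvas's code, and unlike the version above it does not de-duplicate repeated entities; the fencepost problem disappears because the final run of Tree nodes is grouped like any other:

from itertools import groupby

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks_groupby(text):
    # Consecutive Tree nodes (NE chunks) form one run; any non-Tree token
    # in between breaks the run, mirroring the loop above.
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    chunks = []
    for is_tree, run in groupby(chunked, key=lambda node: isinstance(node, Tree)):
        if is_tree:
            chunks.append(" ".join(token
                                   for subtree in run
                                   for token, pos in subtree.leaves()))
    return chunks

print(get_continuous_chunks_groupby("Barack Obama is a great person and so is Michelle Obama."))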
@alvas, great answer. It was really helpful. I have tried to capture your solution in a more functional way. I still have to improve on it, though.
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def conditions(tree_node):
    # Keep only subtrees of height 2, i.e. the NE chunks themselves.
    return tree_node.height() == 2

def continuous_entities(input_text, file_handle=None):
    # Note: currently the chunker only groups consecutive 'NNP' tokens together.
    docs = input_text.split('\n')
    named_entities = []
    for doc in docs:
        chunked_data = ne_chunk(pos_tag(word_tokenize(doc)))
        child_data = [subtree for subtree in chunked_data.subtrees(filter=conditions)]
        for child in child_data:
            if type(child) == Tree:
                named_entities.append(" ".join([token for token, pos in child.leaves()]))
    # Dump all entities to file for now, we will see how to go about that
    if file_handle is not None:
        file_handle.write('\n'.join(named_entities) + '\n')
    return named_entities
Using the conditions function, one can add many conditions to filter on; a small example of a label-based condition follows below.
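For instance, here is a small sketch of a stricter filter (my own addition, not from the answer above) that also checks the chunk label so that only PERSON entities survive; PERSON, ORGANIZATION, GPE, etc. are the labels produced by NLTK's default ne_chunk model:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def person_conditions(tree_node):
    # Same height check as above, plus a label check; swap 'PERSON' for
    # 'ORGANIZATION', 'GPE', ... depending on which entities you want.
    return tree_node.height() == 2 and tree_node.label() == 'PERSON'

chunked = ne_chunk(pos_tag(word_tokenize("Barack Obama met the president of France.")))
people = [" ".join(token for token, pos in subtree.leaves())
          for subtree in chunked.subtrees(filter=person_conditions)]
print(people)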
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    for i in chunked:
        if type(i) == Tree:
            # NE subtree: add its tokens to the chunk being built.
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            # Non-NE token: the current chunk is complete, so flush it.
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue

    # Don't forget a chunk that ends the sentence.
    if current_chunk:
        named_entity = " ".join(current_chunk)
        if named_entity not in continuous_chunk:
            continuous_chunk.append(named_entity)

    return continuous_chunk

txt = "Barack Obama is a great person."
print(get_continuous_chunks(txt))
[out]:
['Barack Obama']
But do note that if a continuous chunk is not supposed to be a single NE, then you would be combining multiple NEs into one. I can't think of such an example off the top of my head, but I'm sure it would happen (a sketch that avoids the merging altogether follows after the example below). But if they are not continuous, the script above works fine:
>>> txt = "Barack Obama is the husband of Michelle Obama."
>>> get_continuous_chunks(txt)
['Barack Obama', 'Michelle Obama']
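If that merging ever does bite, one way around it (my own sketch, not part of the answer above) is to skip the merging entirely and treat every subtree that ne_chunk returns as its own entity, keeping its label so that adjacent entities of different types can never be glued together:

from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_labeled_chunks(text):
    # Each NE subtree becomes one (entity, label) pair; nothing is merged.
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    return [(" ".join(token for token, pos in subtree.leaves()), subtree.label())
            for subtree in chunked if isinstance(subtree, Tree)]

print(get_labeled_chunks("Barack Obama is the husband of Michelle Obama."))

The tradeoff is the opposite failure mode: an NE that the chunker splits across adjacent subtrees (the very case the merging was written for) now stays split, so pick whichever behaviour suits your data.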