Page MenuHomeDevCentral

extract-proper-nouns
No OneTemporary

extract-proper-nouns

#!/usr/bin/env python
import sys
import os.path
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.tokenize.punkt import PunktWordTokenizer
#
# Parses arguments
#
if len(sys.argv) != 2:
print 'Usage:', sys.argv[0], '<text file>'
sys.exit(1)
filename = sys.argv[1]
if not os.path.exists(filename):
print filename + ': no such file'
sys.exit(2)
#
# Extract proper nouns
#
with open (filename, "r") as file:
text=file.read().replace('\n', ' ').strip()
# First, the punkt tokenizer divides our text in sentences.
# Each sentence is then tokenized and POS tagged.
#
# Proper nouns receive the tags 'NPP', we discard first words of sentence to
# reduce the false positive rate. For example, in the following sentence,
# onomatopoeias are tagged as NPP: "Bang! Ssssssss! It exploded.".
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
for sentence in sent_detector.tokenize(text):
tokenizedSentence = word_tokenize(sentence)
taggedSentence = pos_tag(tokenizedSentence)
start = True
currentCandidate = []
for word, pos in taggedSentence:
if start:
start = False
continue
if pos == 'NNP':
currentCandidate.append(word)
continue
if len(currentCandidate) > 0:
print ' '.join(currentCandidate)
currentCandidate = []
if len(currentCandidate) > 0:
print ' '.join(currentCandidate)

File Metadata

Mime Type
text/x-python
Expires
Thu, Apr 16, 05:33 (1 d, 4 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3626577
Default Alt Text
extract-proper-nouns (1 KB)

Event Timeline