Page Menu
Home
DevCentral
Search
Configure Global Search
Log In
Files
F25451752
extract-proper-nouns
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Flag For Later
Size
1 KB
Referenced Files
None
Subscribers
None
extract-proper-nouns
View Options
#!/usr/bin/env python
import
sys
import
os.path
import
nltk
from
nltk.tag
import
pos_tag
from
nltk.tokenize
import
word_tokenize
from
nltk.tokenize.punkt
import
PunktWordTokenizer
#
# Parses arguments
#
# Exactly one positional argument is expected: the path of the text file to
# read. Messages are built as a single string so that print(...) emits the
# same bytes under both Python 2 and Python 3.
if len(sys.argv) != 2:
    print('Usage: %s <text file>' % sys.argv[0])
    sys.exit(1)

filename = sys.argv[1]

# A directory passes os.path.exists() but cannot be opened as a text file
# later on, so it is rejected here with the same message and exit status as
# a missing path instead of failing with a traceback.
if not os.path.exists(filename) or os.path.isdir(filename):
    print(filename + ': no such file')
    sys.exit(2)
#
# Extract proper nouns
#
def proper_noun_runs(tagged_sentence):
    """Yield each run of consecutive proper nouns found in one sentence.

    tagged_sentence is an iterable of (word, tag) pairs, such as the list
    returned by pos_tag. Words tagged 'NNP' that directly follow one another
    are joined with a single space and yielded as one string.

    The first pair of the sentence is always skipped to reduce the false
    positive rate. For example, in the following text, onomatopoeias are
    tagged as NNP: "Bang! Ssssssss! It exploded.".
    """
    pairs = iter(tagged_sentence)
    next(pairs, None)  # drop the first word of the sentence (may be empty)

    run = []
    for word, tag in pairs:
        if tag == 'NNP':
            run.append(word)
        elif run:
            # A non-proper-noun word ends the current run.
            yield ' '.join(run)
            run = []
    # The sentence may end while a run is still open.
    if run:
        yield ' '.join(run)


if __name__ == '__main__':
    # `filename` is set by the argument parsing section above.
    with open(filename, "r") as text_file:
        text = text_file.read().replace('\n', ' ').strip()

    # First, the punkt tokenizer divides our text in sentences.
    # Each sentence is then tokenized and POS tagged, and every run of
    # proper nouns it contains is printed on its own line.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    for sentence in sent_detector.tokenize(text):
        tagged_sentence = pos_tag(word_tokenize(sentence))
        for proper_noun in proper_noun_runs(tagged_sentence):
            print(proper_noun)
File Metadata
Details
Attached
Mime Type
text/x-python
Expires
Thu, Apr 16, 05:33 (1 d, 4 h)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
3626577
Default Alt Text
extract-proper-nouns (1 KB)
Attached To
Mode
rEPN extract-proper-nouns
Attached
Detach File
Event Timeline
Log In to Comment