10 """
11 Read Named Entity tagged data as chunk structures from the CONLL-2002 Corpus
12 """
13
14 from nltk_lite.corpora import get_basedir
15 from nltk_lite import tokenize, chunk
16 from nltk_lite.parse import tree
17 import os
18
19 esp = ['esp.train', 'esp.testa', 'esp.testb']
20 ned = ['ned.train', 'ned.testa', 'ned.testb']
21 items = esp + ned
22
23 item_name = {
24 'ned.train': 'Dutch training set',
25 'ned.testa': 'Dutch test set a',
26 'ned.testb': 'Dutch test set b',
27 'esp.train': 'Spanish training set',
28 'esp.testa': 'Spanish test set a',
29 'ned.testb': 'Spanish test set b',
30 }

def raw(files = ['ned.train', 'esp.train']):
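    # NB: the body of raw() is missing from this copy of the module; the lines
    # below are a minimal sketch, not the original implementation.  They assume
    # the NLTK-distributed CoNLL-2002 files live under
    # <get_basedir()>/conll2002/ with three whitespace-separated columns per
    # line (word, POS tag, IOB named-entity tag) and blank lines between
    # sentences, and that nltk_lite.tokenize provides blankline(), line() and
    # whitespace() tokenizers.
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        for sent in tokenize.blankline(s):
            # yield each sentence as a bare list of words (first column only)
            yield [list(tokenize.whitespace(line))[0]
                   for line in tokenize.line(sent) if line.strip()]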

def pos_tagged(files = ['ned.train', 'esp.train']):
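    # NB: body missing from this copy; a minimal sketch under the same file
    # layout and tokenizer assumptions as raw() above, not the original code.
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        for sent in tokenize.blankline(s):
            # yield each sentence as a list of (word, POS tag) pairs,
            # dropping the IOB named-entity column
            yield [tuple(list(tokenize.whitespace(line))[:2])
                   for line in tokenize.line(sent) if line.strip()]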

def ne_chunked(files = ['ned.train', 'esp.train'], chunk_types=('LOC','ORG','PER')):
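    # NB: body missing from this copy; a minimal sketch under the same file
    # layout assumptions as raw() above.  It additionally assumes that
    # nltk_lite.chunk provides a conllstr2tree() helper that builds a chunk
    # tree from IOB-formatted text; if it does not, the IOB column would need
    # to be parsed by hand.
    if type(files) is str: files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2002", file)
        s = open(path).read()
        for sent in tokenize.blankline(s):
            # yield each sentence as a chunk tree whose subtrees are the
            # requested named-entity types (LOC, ORG, PER by default)
            yield chunk.conllstr2tree(sent, chunk_types)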

def demo():
    from nltk_lite.corpora import conll2002
    from itertools import islice

    print "CONLL2002 NE data\n"

    print "Raw text -- Dutch:"
    for sent in islice(conll2002.raw(files = ['ned.train']), 0, 5):
        print sent
    print

    print "Raw text -- Spanish:"
    for sent in islice(conll2002.raw(files = ['esp.train']), 0, 5):
        print sent
    print

    print "POS Tagged text -- Dutch:"
    for sent in islice(conll2002.pos_tagged(files = ['ned.train']), 0, 5):
        print sent
    print

    print "POS Tagged text -- Spanish:"
    for sent in islice(conll2002.pos_tagged(files = ['esp.train']), 0, 5):
        print sent
    print

    print "Named Entity chunked text -- Dutch:"
    for tree in islice(conll2002.ne_chunked(files = ['ned.train']), 0, 5):
        print tree.pp()
    print

    print "Named Entity chunked text -- Spanish:"
    for tree in islice(conll2002.ne_chunked(files = ['esp.train']), 0, 5):
        print tree.pp()
    print


if __name__ == '__main__':
    demo()