1
2
3
4
5
6
7 from nltk_lite.contrib.classifier_tests import *
8 from nltk_lite.contrib.classifier import discretise
9 from nltk_lite.contrib.classifier import numrange as nr, instances as ins, format
10 from nltk_lite.contrib.classifier.exceptions import invaliddataerror as inv
11
12
# NOTE(review): the `def` header of this test is not present in this listing
# (the fused line numbers jump from 12 to 15); only its body is shown.
# Parses a complete command line (-a, -t, -T, -A, -o) and verifies that each
# option value ends up on `disc.values` exactly as it was passed in.
15 disc = discretise.Discretise()
16 disc.parse(['-a', 'UEW', '-t', 'path', '-T', 'path1,path2', '-A', '3,4,5', '-o', '3,2,4'])
# Read every option back; None is supplied as the second argument of ensure_value.
17 algorithm = disc.values.ensure_value('algorithm', None)
18 training = disc.values.ensure_value('training', None)
19 test = disc.values.ensure_value('test', None)
20 attributes = disc.values.ensure_value('attributes', None)
21 options = disc.values.ensure_value('options', None)
22
# Comma-separated arguments are expected to be kept as raw strings at this stage.
23 self.assertEqual('UEW', algorithm)
24 self.assertEqual('path', training)
25 self.assertEqual('path1,path2', test)
26 self.assertEqual('3,4,5', attributes)
27 self.assertEqual('3,2,4', options)
28
# NOTE(review): the `def` header of this test is not present in this listing.
# Runs the 'UEW' algorithm with no '-o' argument on the command line and
# expects execute() to report the "required arguments are not present" error.
30 path = datasetsDir(self) + 'numerical' + SEP + 'person'
31 disc = DiscretiseStub()
# The stub must start with no error recorded.
32 self.assertFalse(disc.error_called)
33 disc.parse(['-a', 'UEW', '-t', path, '-T', path + '.test,' + path + 'extra.test', '-A', '3,4,5'])
34 disc.execute()
# After execute() the stub should have captured the error flag and its message.
35 self.assertTrue(disc.error_called)
36 self.assertEqual('Invalid arguments. One or more required arguments are not present.', disc.message)
37
# NOTE(review): the `def` header of this test is not present in this listing.
# Same command line as the 'UEW' missing-options case, but with algorithm
# 'NS': here the absence of '-o' must NOT be reported as an error.
39 path = datasetsDir(self) + 'numerical' + SEP + 'person'
40 disc = DiscretiseStub()
41 self.assertFalse(disc.error_called)
42
43 disc.parse(['-a', 'NS', '-t', path, '-T', path + '.test,' + path + 'extra.test', '-A', '3,4,5'])
44 disc.execute()
45
# The error flag is still unset after execute().
46 self.assertFalse(disc.error_called)
47
# NOTE(review): the `def` header of this test is not present in this listing.
# Builds a Discretiser from the 'person' dataset and checks what the
# constructor stores: the training/test instances and the two list arguments.
49 path = datasetsDir(self) + 'numerical' + SEP + 'person'
# get_instances is called with get_test=True, get_gold=False.
50 training, attributes, klass, test, gold = self.get_instances(path, True, False)
51 disc = discretise.Discretiser(training, attributes, klass, test, gold, [0,1,4,5,6,7], [2,3,2,3,4,2])
52 self.assertEqual(6, len(disc.training))
53 self.assertEqual(2, len(disc.test))
# The index list and the options list are exposed unchanged.
54 self.assertEqual([0, 1, 4, 5, 6, 7], disc.attribute_indices)
55 self.assertEqual([2, 3, 2, 3, 4, 2], disc.options)
56
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks unsupervised_equal_width(): attributes whose indices were passed to
# the Discretiser ([1,4,5,6,7]) stop being continuous, and instance values
# for them are replaced by a label.
58 path = datasetsDir(self) + 'numerical' + SEP + 'person'
59 training, attributes, klass, test, gold = self.get_instances(path, True, False)
60 disc = discretise.Discretiser(training, attributes, klass, test, gold, [1,4,5,6,7], [3,2,3,4,2])
# Before discretisation: all of these attributes are continuous and the
# first training/test values of attribute 1 are still numeric.
61 self.assertTrue(disc.attributes[0].is_continuous())
62 self.assertTrue(disc.attributes[1].is_continuous())
63 self.assertTrue(disc.attributes[4].is_continuous())
64 self.assertTrue(disc.attributes[5].is_continuous())
65 self.assertTrue(disc.attributes[6].is_continuous())
66 self.assertTrue(disc.attributes[7].is_continuous())
67 self.assertEqual(25, disc.training[0].value(disc.attributes[1]))
68 self.assertEqual(26, disc.test[0].value(disc.attributes[1]))
69 disc.unsupervised_equal_width()
# Attribute 0 was not in the index list, so it is expected to stay continuous.
70 self.assertTrue(disc.attributes[0].is_continuous())
71 self.assertFalse(disc.attributes[1].is_continuous())
72 self.assertFalse(disc.attributes[4].is_continuous())
73 self.assertFalse(disc.attributes[5].is_continuous())
74 self.assertFalse(disc.attributes[6].is_continuous())
75 self.assertFalse(disc.attributes[7].is_continuous())
# Both the training value 25 and the test value 26 now read back as 'a'.
76 self.assertEqual('a', disc.training[0].value(disc.attributes[1]))
77 self.assertEqual('a', disc.test[0].value(disc.attributes[1]))
78
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks discretised_attributes(): given one Range per selected attribute it
# returns one attribute per range, keeping the original attribute index.
80 path = datasetsDir(self) + 'numerical' + SEP + 'person'
81 training, attributes, klass, test, gold = self.get_instances(path, True, False)
82 disc = discretise.Discretiser(training, attributes, klass, test, gold, [4,6], [2,4])
83 disc_attrs = disc.discretised_attributes([nr.Range(0, 2), nr.Range(0, 120000)])
84 self.assertEqual(2, len(disc_attrs))
85 self.assertEqual(4, disc_attrs[0].index)
# The value counts (2 and 4) are the same numbers that were passed as options.
86 self.assertEqual(2, len(disc_attrs[0].values))
87 self.assertEqual(4, len(disc_attrs[1].values))
88
# NOTE(review): the `def` header of this test is not present in this listing,
# and the indentation of the try/except body below has been lost.
# Expects an InvalidDataError when one of the options passed to the
# Discretiser is zero ([2,0]); reaching self.fail means nothing was raised.
90 path = datasetsDir(self) + 'numerical' + SEP + 'person'
91 try:
92 training, attributes, klass, test, gold = self.get_instances(path, True, False)
93 disc = discretise.Discretiser(training, attributes, klass, test, gold, [4,6], [2,0])
94 self.fail('should raise error as an option is zero')
95 except inv.InvalidDataError:
# The exception is the expected outcome, so it is deliberately ignored.
96 pass
97
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks ranges_from_chunks(): three chunks of sorted numbers yield three
# ranges, and each range includes the first and last value of its chunk.
99 ranges = discretise.ranges_from_chunks([[6, 6, 7, 7, 8], [9, 10, 10, 13, 14], [15, 16, 16, 16, 19]])
100 self.assertEqual(3, len(ranges))
101 self.assertTrue(ranges[0].includes(6))
102 self.assertTrue(ranges[0].includes(8))
# 8.9 lies between the end of chunk 1 (8) and the start of chunk 2 (9); it is
# expected to belong to the first range.
103 self.assertTrue(ranges[0].includes(8.9))
104 self.assertTrue(ranges[1].includes(9))
105 self.assertTrue(ranges[1].includes(14))
106 self.assertTrue(ranges[2].includes(15))
107 self.assertTrue(ranges[2].includes(19))
108
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks get_chunks_with_frequency() with a frequency of 5 on 18 sorted values.
# In the expected result the repeats of a chunk's last value (the extra 8s
# after the first chunk, the second 14 after the second chunk) are absent.
110 chunks = discretise.get_chunks_with_frequency([6, 6, 7, 7, 8, 8, 8, 9, 10, 10, 13, 14, 14, 15, 16, 16, 16, 19], 5)
111 self.assertEqual(3, len(chunks))
112 self.assertEqual([[6, 6, 7, 7, 8], [9, 10, 10, 13, 14], [15, 16, 16, 16, 19]], chunks)
113
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks unsupervised_equal_frequency() on attribute 1 of the 'weather'
# dataset, created with option 3.
115 path = datasetsDir(self) + 'numerical' + SEP + 'weather'
# get_instances is called with its defaults here (get_test=True, get_gold=True).
116 training, attributes, klass, test, gold = self.get_instances(path)
117 disc = discretise.Discretiser(training, attributes, klass, test, gold, [1], [3])
# Before discretisation the attribute is continuous and values are numeric.
118 self.assertTrue(disc.attributes[1].is_continuous())
119 self.assertEqual(27.5, disc.training[0].value(disc.attributes[1]))
120 self.assertEqual(32, disc.training[2].value(disc.attributes[1]))
121 self.assertEqual(25.4, disc.test[0].value(disc.attributes[1]))
# Sanity check of the raw data: the 12 training values of attribute 1, sorted.
122 values = disc.training.values_grouped_by_attribute([disc.attributes[1]])
123 values[0].sort()
124 self.assertEqual([6.0, 9.0, 9.0, 10.699999999999999, 12.0, 12.0, 12.0, 14.1, 18.0, 27.5, 32.0, 33.100000000000001], values[0])
125
126 disc.unsupervised_equal_frequency()
127
# Afterwards the attribute has 4 discrete values and the instances carry labels.
128 self.assertFalse(disc.attributes[1].is_continuous())
129 self.assertEqual(4, len(disc.attributes[1].values))
130 self.assertEqual('c', disc.training[0].value(disc.attributes[1]))
131 self.assertEqual('d', disc.training[2].value(disc.attributes[1]))
132 self.assertEqual('c', disc.test[0].value(disc.attributes[1]))
133
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks naive_supervised() on attribute 1 of the 'person' dataset. The
# Discretiser is built without an options argument here.
135 path = datasetsDir(self) + 'numerical' + SEP + 'person'
136 training, attributes, klass, test, gold = self.get_instances(path, True, False)
137 disc = discretise.Discretiser(training, attributes, klass, test, gold, [1])
# Attribute 1 has a single value before discretisation ...
138 self.assertEqual(1, len(disc.attributes[1].values))
139
140 disc.naive_supervised()
141
# ... and three values afterwards.
142 self.assertEqual(3, len(disc.attributes[1].values))
143
# NOTE(review): the `def` header of this test is not present in this listing.
# Checks that `disc.subset` holds exactly the attributes selected by the
# index list passed to the Discretiser ([4,6]), in that order.
145 path = datasetsDir(self) + 'numerical' + SEP + 'person'
146 training, attributes, klass, test, gold = self.get_instances(path, True, False)
147 disc = discretise.Discretiser(training, attributes, klass, test, gold, [4,6], [2,2])
148 self.assertEqual(2, len(disc.subset))
149 self.assertEqual(4, disc.subset[0].index)
150 self.assertEqual(6, disc.subset[1].index)
151
# Test helper; the tests above unpack its result as
# (training, attributes, klass, test, gold). Both flags default to True.
# NOTE(review): the body of this method is not present in this listing.
152 - def get_instances(self, path, get_test = True, get_gold = True):
160
161
167
def error(self, message):
    """Record a reported error on the instance instead of acting on it.

    Stores ``message`` in ``self.message`` and sets ``self.error_called`` to
    True, so a test can assert whether an error was reported and with what
    text.
    """
    # NOTE(review): presumably a method of the DiscretiseStub class used by
    # the tests; the class header is not visible in this listing, so confirm
    # the enclosing class and re-indent accordingly.
    self.message = message
    self.error_called = True
172
176