forked from devAmoghS/Machine-Learning-with-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
112 lines (77 loc) · 3.51 KB
/
Copy pathutils.py
File metadata and controls
112 lines (77 loc) · 3.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import math
from collections import Counter, defaultdict
from functools import partial
def entropy(class_probabilities):
"""given a list of class probabilities, compute the entropy"""
return sum(-p * math.log(p, 2) for p in class_probabilities if p)
def class_probabilities(labels):
total_count = len(labels)
return [count / total_count
for count in Counter(labels).values()]
def data_entropy(labeled_data):
labels = [label for _, label in labeled_data]
probabilities = class_probabilities(labels)
return entropy(probabilities)
def partition_entropy(subsets):
"""find the entropy from this partition of data into subsets"""
total_count = sum(len(subset) for subset in subsets)
return sum(data_entropy(subset) * len(subset) / total_count
for subset in subsets)
def group_by(items, key_fn):
"""returns a defaultdict(list), where each input item
is in the list whose key is key_fn(item)"""
groups = defaultdict(list)
for item in items:
key = key_fn(item)
groups[key].append(item)
return groups
def partition_by(inputs, attribute):
"""returns a dict of inputs partitioned by the attribute
each input is a pair (attribute_dict, label)"""
return group_by(inputs, lambda x: x[0][attribute])
def partition_entropy_by(inputs, attribute):
"""computes the entropy corresponding to the given partition"""
partitions = partition_by(inputs, attribute)
return partition_entropy(partitions.values())
def classify(tree, input):
"""classify the input using the given decision tree"""
# if this is a leaf node, return its value
if tree in [True, False]:
return tree
# otherwise find the correct subtree
attribute, subtree_dict = tree
subtree_key = input.get(attribute) # None if input is missing attribute
if subtree_key not in subtree_dict: # if no subtree for key,
subtree_key = None # we'll use the None subtree
subtree = subtree_dict[subtree_key] # choose the appropriate subtree
return classify(subtree, input) # and use it to classify the input
def build_tree_id3(inputs, split_candidates=None):
# if this is our first pass,
# all keys of the first input are split candidates
if split_candidates is None:
split_candidates = inputs[0][0].keys()
# count Trues and Falses in the inputs
num_inputs = len(inputs)
num_trues = len([label for item, label in inputs if label])
num_falses = num_inputs - num_trues
if num_trues == 0: # if only Falses are left
return False # return a "False" leaf
if num_falses == 0: # if only Trues are left
return True # return a "True" leaf
if not split_candidates: # if no split candidates left
return num_trues >= num_falses # return the majority leaf
# otherwise, split on the best attribute
best_attribute = min(split_candidates,
key=partial(partition_entropy_by, inputs))
partitions = partition_by(inputs, best_attribute)
new_candidates = [a for a in split_candidates
if a != best_attribute]
# recursively build the subtrees
subtrees = {attribute: build_tree_id3(subset, new_candidates)
for attribute, subset in partitions.items()}
subtrees[None] = num_trues > num_falses # default case
return best_attribute, subtrees
def forest_classify(trees, input):
votes = [classify(tree, input) for tree in trees]
vote_counts = Counter(votes)
return vote_counts.most_common(1)[0][0]