Newer
Older
else:
return ValueError("Decision function must be either 'ovr' or 'ovo'.")
def EnsembleLearner(learners):
    """Given a list of learning algorithms, have them vote."""

    def train(dataset):
        # Fit every learner on the same dataset.
        fitted = [learn(dataset) for learn in learners]

        def predict(example):
            # Majority vote over the individual predictions.
            votes = (h(example) for h in fitted)
            return mode(votes)

        return predict

    return train
Donato Meoli
a validé
def ada_boost(dataset, L, K):
    """
    [Figure 18.34]
    AdaBoost: run the weighted learner L for K rounds, boosting the
    weights of misclassified examples after each round, and return the
    weighted-majority combination of the K hypotheses.
    """
    examples, target = dataset.examples, dataset.target
    n = len(examples)
    # Clip bound that keeps the log / division finite at 0% or 100% error.
    eps = 1 / (2 * n)
    # Start with uniform example weights.
    w = [1 / n] * n
    hypotheses, votes = [], []
    for _ in range(K):
        h_k = L(dataset, w)
        hypotheses.append(h_k)
        # Weighted error: total weight of the misclassified examples.
        err = sum(wt for ex, wt in zip(examples, w) if ex[target] != h_k(ex))
        # avoid divide-by-0 from either 0% or 100% error rates
        err = np.clip(err, eps, 1 - eps)
        # Shrink weights of correctly classified examples, then renormalize.
        w = normalize([wt * (err / (1 - err)) if ex[target] == h_k(ex) else wt
                       for ex, wt in zip(examples, w)])
        # Hypothesis vote is the log-odds of its accuracy.
        votes.append(np.log((1 - err) / err))
    return weighted_majority(hypotheses, votes)
def weighted_majority(predictors, weights):
    """Return a predictor that takes a weighted vote.

    Fix: the vote must happen inside a closure that receives the example;
    the previous body referenced `example` with no scope defining it
    (NameError on every call).
    """

    def predict(example):
        return weighted_mode((predictor(example) for predictor in predictors), weights)

    return predict
Donato Meoli
a validé
"""
Return the value with the greatest total weight.
>>> weighted_mode('abbaa', [1, 2, 3, 1, 2])
'b'
"""
totals = defaultdict(int)
for v, w in zip(values, weights):
totals[v] += w
return max(totals, key=totals.__getitem__)
Donato Meoli
a validé
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
def RandomForest(dataset, n=5):
    """An ensemble of n Decision Trees trained using bagging and feature bagging."""

    def data_bagging(dataset, m=0):
        """Sample m examples with replacement (defaults to the dataset size)."""
        n_examples = len(dataset.examples)
        return weighted_sample_with_replacement(m or n_examples, dataset.examples, [1] * n_examples)

    def feature_bagging(dataset, p=0.7):
        """Feature bagging with probability p to retain an attribute."""
        inputs = [i for i in dataset.inputs if probability(p)]
        # Never return an empty input set; fall back to all inputs.
        return inputs or dataset.inputs

    def predict(example):
        # Fix: removed a leftover debug print, which also made every tree
        # classify each example twice per prediction.
        return mode(predictor(example) for predictor in predictors)

    # Each tree sees a bootstrap sample of examples and a random attribute subset.
    predictors = [DecisionTreeLearner(DataSet(examples=data_bagging(dataset), attrs=dataset.attrs,
                                              attr_names=dataset.attr_names, target=dataset.target,
                                              inputs=feature_bagging(dataset))) for _ in range(n)]

    return predict
def WeightedLearner(unweighted_learner):
    """
    [Page 749 footnote 14]
    Given a learner that takes just an unweighted dataset, return
    one that takes also a weight for each example.

    Fix: the `train` closure must be returned; the visible body fell off
    the end and returned None, so the result was not callable.
    """

    def train(dataset, weights):
        # Simulate weights by replicating examples in proportion to them.
        return unweighted_learner(replicated_dataset(dataset, weights))

    return train
def replicated_dataset(dataset, weights, n=None):
    """Copy dataset, replicating each example in proportion to its weight."""
    # Default the replica count to the original number of examples.
    n = n or len(dataset.examples)
    clone = copy.copy(dataset)
    clone.examples = weighted_replicate(dataset.examples, weights, n)
    return clone
def weighted_replicate(seq, weights, n):
    """
    Return n selections from seq, with the count of each element of
    seq proportional to the corresponding weight (filling in fractions
    randomly).
    >>> weighted_replicate('ABC', [1, 2, 1], 4)
    ['A', 'B', 'B', 'C']
    """
    assert len(seq) == len(weights)
    weights = normalize(weights)
    # Integer part of each element's share, and the fractional remainder.
    whole_counts = [int(w * n) for w in weights]
    leftovers = [(w * n) % 1 for w in weights]
    # Deterministic replicas first, then fill the remaining slots by
    # sampling with the fractional weights.
    replicated = flatten([item] * count for item, count in zip(seq, whole_counts))
    return replicated + weighted_sample_with_replacement(n - sum(whole_counts), seq, leftovers)
Donato Meoli
a validé
# metrics
def accuracy_score(y_pred, y_true):
    """Fraction of predictions that exactly match the true labels."""
    assert y_pred.shape == y_true.shape
    return np.mean(y_pred == y_true)
def r2_score(y_pred, y_true):
    """Coefficient of determination: 1 - SS_res / SS_tot."""
    assert y_pred.shape == y_true.shape
    ss_res = np.sum(np.square(y_pred - y_true))  # sum of squared residuals
    ss_tot = np.sum(np.square(y_true - np.mean(y_true)))  # total sum of squares
    return 1. - ss_res / ss_tot
Donato Meoli
a validé
# datasets
Donato Meoli
a validé
# O-ring dataset: predict 'Distressed' from the remaining attributes.
orings = DataSet(name='orings', target='Distressed', attr_names='Rings Distressed Temp Pressure Flightnum')

# Zoo dataset: predict the animal 'type'; 'name' is excluded from the inputs.
zoo = DataSet(name='zoo', target='type', exclude=['name'],
              attr_names='name hair feathers eggs milk airborne aquatic predator toothed backbone '
                         'breathes venomous fins legs tail domestic catsize type')

# Iris dataset: predict 'class' from four flower measurements.
iris = DataSet(name='iris', target='class', attr_names='sepal-len sepal-width petal-len petal-width class')
def RestaurantDataSet(examples=None):
    """
    [Figure 18.3]
    Build a DataSet of Restaurant waiting examples.
    """
    names = 'Alternate Bar Fri/Sat Hungry Patrons Price Raining Reservation Type WaitEstimate Wait'
    return DataSet(name='restaurant', target='Wait', examples=examples, attr_names=names)
restaurant = RestaurantDataSet()


def T(attr_name, branches):
    """Shorthand for building the restaurant decision tree: wrap each
    non-fork branch value in a DecisionLeaf and return a DecisionFork
    that tests attr_name."""
    branches = {value: (child if isinstance(child, DecisionFork) else DecisionLeaf(child))
                for value, child in branches.items()}
    # NOTE(review): `print` is passed positionally to DecisionFork here —
    # presumably as a default/display callable; confirm against
    # DecisionFork's signature.
    return DecisionFork(restaurant.attr_num(attr_name), attr_name, print, branches)
Donato Meoli
a validé
"""
[Figure 18.2]
Surya Teja Cheedella
a validé
A decision tree for deciding whether to wait for a table at a hotel.
"""
waiting_decision_tree = T('Patrons',
{'None': 'No', 'Some': 'Yes',
'Full': T('WaitEstimate',
{'>60': 'No', '0-10': 'Yes',
'30-60': T('Alternate',
{'No': T('Reservation',
{'Yes': 'Yes',
'No': T('Bar', {'No': 'No',
'Yes': 'Yes'})}),
Donato Meoli
a validé
'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}),
'10-30': T('Hungry',
{'No': 'Yes',
'Yes': T('Alternate',
{'No': 'Yes',
'Yes': T('Raining',
{'No': 'No',
'Yes': 'Yes'})})})})})
def SyntheticRestaurant(n=20):
    """Generate a DataSet with n examples labeled by the waiting decision tree."""

    def gen():
        # Pick a random value for every attribute, then overwrite the
        # target with the tree's decision for that example.
        example = [random.choice(values) for values in restaurant.values]
        example[restaurant.target] = waiting_decision_tree(example)
        return example

    return RestaurantDataSet([gen() for _ in range(n)])
def Majority(k, n):
    """
    Return a DataSet with n k-bit examples of the majority problem:
    k random bits followed by a 1 if more than half the bits are 1, else 0.
    """
    examples = []
    for i in range(n):
        bits = [random.choice([0, 1]) for _ in range(k)]
        # Fix: append the target bit promised by the docstring; the visible
        # code stored only the k input bits with no majority label.
        bits.append(int(sum(bits) > k / 2))
        examples.append(bits)
    return DataSet(name='majority', examples=examples)
Donato Meoli
a validé
def Parity(k, n, name='parity'):
    """
    Return a DataSet with n k-bit examples of the parity problem:
    k random bits followed by a 1 if an odd number of bits are 1, else 0.
    """
    examples = []
    for _ in range(n):
        row = [random.choice([0, 1]) for _ in range(k)]
        # Target bit: parity of the k input bits.
        row.append(sum(row) % 2)
        examples.append(row)
    return DataSet(name=name, examples=examples)
def Xor(n):
    """Return a DataSet with n examples of 2-input xor."""
    # Xor is exactly the 2-bit parity problem.
    return Parity(2, n, name='xor')
def ContinuousXor(n):
    """2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints."""
    examples = []
    for i in range(n):
        x, y = [random.uniform(0.0, 2.0) for _ in '12']
        # Fix: the label is the xor of the integer parts, as the docstring
        # says; comparing the raw floats (`x != y`) was (almost) always True.
        examples.append([x, y, int(x) != int(y)])
    return DataSet(name='continuous xor', examples=examples)
Donato Meoli
a validé
def compare(algorithms=None, datasets=None, k=10, trials=1):
    """
    Compare various learners on various datasets using cross-validation.
    Print results as a table.
    """
    # Fall back to the default learners / datasets when none are given.
    algorithms = algorithms or [PluralityLearner, NaiveBayesLearner, NearestNeighborLearner, DecisionTreeLearner]
    datasets = datasets or [iris, orings, zoo, restaurant, SyntheticRestaurant(20),
                            Majority(7, 100), Parity(7, 100), Xor(100)]
    # One row per algorithm: its short name plus a score per dataset.
    rows = [[alg.__name__.replace('Learner', '')] +
            [cross_validation(alg, d, k=k, trials=trials) for d in datasets]
            for alg in algorithms]
    print_table(rows, header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')