Newer
Older
return cross_validation(learner, size, dataset, k=len(dataset.examples))
def learningcurve(learner, dataset, trials=10, sizes=None):
if sizes is None:
sizes = list(range(2, len(dataset.examples) - 10, 2))
def score(learner, size):
random.shuffle(dataset.examples)
return train_and_test(learner, dataset, 0, size)
return [(size, mean([score(learner, size) for t in range(trials)]))
for size in sizes]
# ______________________________________________________________________________
# The rest of this file gives datasets for machine learning problems.
orings = DataSet(name='orings', target='Distressed',
attrnames="Rings Distressed Temp Pressure Flightnum")
zoo = DataSet(name='zoo', target='type', exclude=['name'],
attrnames="name hair feathers eggs milk airborne aquatic " +
"predator toothed backbone breathes venomous fins legs tail " +
iris = DataSet(name="iris", target="class",
attrnames="sepal-len sepal-width petal-len petal-width class")
# ______________________________________________________________________________
# The Restaurant example from [Figure 18.2]
def RestaurantDataSet(examples=None):
"""Build a DataSet of Restaurant waiting examples. [Figure 18.3]"""
return DataSet(name='restaurant', target='Wait', examples=examples,
attrnames='Alternate Bar Fri/Sat Hungry Patrons Price ' +
'Raining Reservation Type WaitEstimate Wait')
restaurant = RestaurantDataSet()
def T(attrname, branches):
branches = {value: (child if isinstance(child, DecisionFork)
else DecisionLeaf(child))
for value, child in branches.items()}
return DecisionFork(restaurant.attrnum(attrname), attrname, branches)
Surya Teja Cheedella
a validé
""" [Figure 18.2]
A decision tree for deciding whether to wait for a table at a hotel.
"""
waiting_decision_tree = T('Patrons',
{'None': 'No', 'Some': 'Yes',
'Full': T('WaitEstimate',
{'>60': 'No', '0-10': 'Yes',
'30-60': T('Alternate',
{'No': T('Reservation',
{'Yes': 'Yes',
'No': T('Bar', {'No': 'No',
'Yes': 'Yes'})}),
'Yes': T('Fri/Sat', {'No': 'No', 'Yes': 'Yes'})}
),
'10-30': T('Hungry',
{'No': 'Yes',
'Yes': T('Alternate',
{'No': 'Yes',
'Yes': T('Raining',
{'No': 'No',
'Yes': 'Yes'})})})})})
def SyntheticRestaurant(n=20):
"""Generate a DataSet with n examples."""
def gen():
example = list(map(random.choice, restaurant.values))
Surya Teja Cheedella
a validé
example[restaurant.target] = waiting_decision_tree(example)
return example
return RestaurantDataSet([gen() for i in range(n)])
# ______________________________________________________________________________
# Artificial, generated datasets.
def Majority(k, n):
"""Return a DataSet with n k-bit examples of the majority problem:
k random bits followed by a 1 if more than half the bits are 1, else 0."""
examples = []
for i in range(n):
bits = [random.choice([0, 1]) for i in range(k)]
examples.append(bits)
return DataSet(name="majority", examples=examples)
def Parity(k, n, name="parity"):
"""Return a DataSet with n k-bit examples of the parity problem:
k random bits followed by a 1 if an odd number of bits are 1, else 0."""
examples = []
for i in range(n):
bits = [random.choice([0, 1]) for i in range(k)]
bits.append(sum(bits) % 2)
examples.append(bits)
return DataSet(name=name, examples=examples)
def Xor(n):
"""Return a DataSet with n examples of 2-input xor."""
return Parity(2, n, name="xor")
def ContinuousXor(n):
"2 inputs are chosen uniformly from (0.0 .. 2.0]; output is xor of ints."
examples = []
for i in range(n):
x, y = [random.uniform(0.0, 2.0) for i in '12']
examples.append([x, y, int(x) != int(y)])
return DataSet(name="continuous xor", examples=examples)
# ______________________________________________________________________________
def compare(algorithms=[PluralityLearner, NaiveBayesLearner,
NearestNeighborLearner, DecisionTreeLearner],
datasets=[iris, orings, zoo, restaurant, SyntheticRestaurant(20),
Majority(7, 100), Parity(7, 100), Xor(100)],
k=10, trials=1):
"""Compare various learners on various datasets using cross-validation.
Print results as a table."""
print_table([[a.__name__.replace('Learner', '')] +
[cross_validation(a, d, k, trials) for d in datasets]
for a in algorithms],
header=[''] + [d.name[0:7] for d in datasets], numfmt='%.2f')