Joseph Kliegman / Feb 27 2019
Equipment Success Prediction
from __future__ import print_function

import math

from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
import sklearn
from sklearn import metrics
import tensorflow as tf
from tensorflow import keras
import keras  # note: this shadows tensorflow.keras above; the standalone Keras package is what gets used below
from tensorflow.keras import layers
from tensorflow.python.data import Dataset

tf.logging.set_verbosity(tf.logging.ERROR)
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.1f}'.format
keras.__version__
sklearn.__version__
! ls
equipment_success_dataframe = pd.read_csv("equipment_success_unique.csv", sep=",")

equipment_success_dataframe['decades_old'] = round(equipment_success_dataframe['age_in_months_at_equipped_start'] / 120)
equipment_success_dataframe['years'] = round(equipment_success_dataframe['age_in_months_at_equipped_start'] / 12)

def isMale(x):
    if x == "male":
        return 1
    return 0

def isFemale(x):
    if x == "female":
        return 1
    return 0

def isRightEar(x):
    if x == "R":
        return 1
    return 0

def isLeftEar(x):
    if x == "L":
        return 1
    return 0

def isBothEars(x):
    if x == "BIN":
        return 1
    return 0

def dbToAmp(x):
    # Convert a dB value into an amplitude ratio (20*log10 convention).
    return 10 ** (x / 20)

def genderToNum(x):
    if x == "male":
        return 1
    if x == "female":
        return 2
    return 0

equipment_success_dataframe['genderNum'] = equipment_success_dataframe['gender'].apply(genderToNum)
equipment_success_dataframe['isMale'] = equipment_success_dataframe['gender'].apply(isMale)
equipment_success_dataframe['isFemale'] = equipment_success_dataframe['gender'].apply(isFemale)
equipment_success_dataframe['isRightEar'] = equipment_success_dataframe['ears'].apply(isRightEar)
equipment_success_dataframe['isLeftEar'] = equipment_success_dataframe['ears'].apply(isLeftEar)
equipment_success_dataframe['isBothEars'] = equipment_success_dataframe['ears'].apply(isBothEars)

'''
equipment_success_dataframe['freq_500'] = dbToAmp(equipment_success_dataframe['freq_500'])
equipment_success_dataframe['freq_1000'] = dbToAmp(equipment_success_dataframe['freq_1000'])
equipment_success_dataframe['freq_2000'] = dbToAmp(equipment_success_dataframe['freq_2000'])
equipment_success_dataframe['freq_4000'] = dbToAmp(equipment_success_dataframe['freq_4000'])
equipment_success_dataframe['eq_freq_500'] = dbToAmp(equipment_success_dataframe['eq_freq_500'])
equipment_success_dataframe['eq_freq_1000'] = dbToAmp(equipment_success_dataframe['eq_freq_1000'])
equipment_success_dataframe['eq_freq_2000'] = dbToAmp(equipment_success_dataframe['eq_freq_2000'])
equipment_success_dataframe['eq_freq_4000'] = dbToAmp(equipment_success_dataframe['eq_freq_4000'])
'''

# Loss at each frequency relative to the loss at 1000 Hz.
equipment_success_dataframe['freq_500_over_freq_1000'] = equipment_success_dataframe['freq_500'] / equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_2000_over_freq_1000'] = equipment_success_dataframe['freq_2000'] / equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_4000_over_freq_1000'] = equipment_success_dataframe['freq_4000'] / equipment_success_dataframe['freq_1000']

# Gain = unaided loss minus equipped loss at each frequency.
equipment_success_dataframe['freq_500_gain'] = equipment_success_dataframe['freq_500'] - equipment_success_dataframe['eq_freq_500']
equipment_success_dataframe['freq_1000_gain'] = equipment_success_dataframe['freq_1000'] - equipment_success_dataframe['eq_freq_1000']
equipment_success_dataframe['freq_2000_gain'] = equipment_success_dataframe['freq_2000'] - equipment_success_dataframe['eq_freq_2000']
equipment_success_dataframe['freq_4000_gain'] = equipment_success_dataframe['freq_4000'] - equipment_success_dataframe['eq_freq_4000']

# Gain as a fraction of the original loss.
equipment_success_dataframe['freq_500_gain_ratio'] = equipment_success_dataframe['freq_500_gain'] / equipment_success_dataframe['freq_500']
equipment_success_dataframe['freq_1000_gain_ratio'] = equipment_success_dataframe['freq_1000_gain'] / equipment_success_dataframe['freq_1000']
equipment_success_dataframe['freq_2000_gain_ratio'] = equipment_success_dataframe['freq_2000_gain'] / equipment_success_dataframe['freq_2000']
equipment_success_dataframe['freq_4000_gain_ratio'] = equipment_success_dataframe['freq_4000_gain'] / equipment_success_dataframe['freq_4000']

# equipment_success_dataframe['eq_gain_ratio'] = (-equipment_success_dataframe['eq_average_loss'] + equipment_success_dataframe['average_loss']) / equipment_success_dataframe['average_loss']

equipment_success_dataframe = equipment_success_dataframe.drop(columns=["eq_average_loss", "eq_high_loss", "eq_low_loss", "rk"])
equipment_success_dataframe
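As a quick sanity check on the dB helper above, dbToAmp inverts the 20·log10 amplitude convention; a tiny check with made-up values:

# Hypothetical check: every 20 dB should multiply the amplitude ratio by 10.
for db, expected in [(0, 1.0), (20, 10.0), (40, 100.0)]:
    assert abs(dbToAmp(db) - expected) < 1e-9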
equipment_success_dataframe['valid'] = (
    # equipment_success_dataframe['freq_250'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['freq_500'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['freq_750'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['freq_1000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['freq_1500'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['freq_2000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['freq_3000'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['freq_4000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['freq_6000'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['freq_8000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_500'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['eq_freq_1000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_2000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['eq_freq_4000'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['freq_500_gain'].apply(lambda x : not math.isnan(x) and x > 0) &
    equipment_success_dataframe['freq_1000_gain'].apply(lambda x : not math.isnan(x) and x > 0) &
    # equipment_success_dataframe['freq_2000_gain'].apply(lambda x : not math.isnan(x) and x > 0) &
    # equipment_success_dataframe['freq_4000_gain'].apply(lambda x : not math.isnan(x) and x > 0) &
    # equipment_success_dataframe['freq_500_gain_ratio'].apply(lambda x : not math.isnan(x) and x < 1) &
    equipment_success_dataframe['freq_1000_gain_ratio'].apply(lambda x : not math.isnan(x) and x < 1) &
    # equipment_success_dataframe['freq_2000_gain_ratio'].apply(lambda x : not math.isnan(x) and x < 1) &
    # equipment_success_dataframe['freq_4000_gain_ratio'].apply(lambda x : not math.isnan(x) and x < 1) &
    equipment_success_dataframe['isBothEars'].apply(lambda x : x == 0) &
    equipment_success_dataframe['average_loss'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['high_loss'].apply(lambda x : not math.isnan(x)) &
    # equipment_success_dataframe['low_loss'].apply(lambda x : not math.isnan(x)) &
    equipment_success_dataframe['months_since_equipped'].apply(lambda x : x > 12) &
    equipment_success_dataframe['decades_old'].apply(lambda x : not math.isnan(x) and x < 11 and x > 4)
)
equipment_success_dataframe.query(' valid == True')
# equipment_success = equipment_success_dataframe.query('center_id == 29 and valid == True').copy()
equipment_success_valid = equipment_success_dataframe.query(' valid == True').copy()
equipment_success_valid
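Most of the NaN checks above could also be written with pandas' vectorised helpers; an equivalent sketch for the NaN-only part of the filter (the value-range conditions would still need to be expressed separately):

# NaN portion of the validity filter, vectorised (covers only the notna checks).
required = ['freq_500', 'freq_1000', 'freq_2000', 'freq_4000', 'freq_8000',
            'eq_freq_1000', 'average_loss']
nan_ok = equipment_success_dataframe[required].notna().all(axis=1)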
for feature in ['eq_freq_1000', 'freq_1000_gain_ratio', 'freq_2000_gain_ratio']:
    display.display(equipment_success_valid.hist(feature))
def scaleInner(df, feature):
    stats = df[feature].describe()
    std = stats['std']
    mean = stats['mean']
    df[feature + '_scaled'] = (df[feature] - mean) / std
    return df

def scale(df, features):
    # Z-score each feature into a new '<feature>_scaled' column, leaving the original intact.
    scaledDf = df.copy()
    for feature in features:
        scaleInner(scaledDf, feature)
    return scaledDf
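A minimal usage sketch of scale on a toy frame (column name hypothetical):

toy = pd.DataFrame({'x': [1.0, 2.0, 3.0]})
toy_scaled = scale(toy, ['x'])
# 'x_scaled' ends up with mean ~0 and std ~1 (describe() uses the sample std, ddof=1).
print(toy_scaled['x_scaled'].mean(), toy_scaled['x_scaled'].std())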
def success_ratio(equipment_success, freq, multiplier, threshold):
    # A fitting counts as a "success" when the gain exceeds multiplier*loss - threshold at this frequency.
    data = pd.value_counts(
        equipment_success['freq_' + freq + '_gain'] >
        (equipment_success['freq_' + freq] * multiplier) - threshold
    )
    return data[True] / (data[False] + data[True])
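To make the success criterion concrete, a hand-worked example with hypothetical numbers: with multiplier=0.5 and threshold=5, a fitting counts as a success when gain > 0.5 * loss - 5.

# Hypothetical patient: 60 dB loss at some frequency, 28 dB of measured gain.
loss, gain = 60, 28
print(gain > 0.5 * loss - 5)  # True: 28 > 25, so this row would count as a success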
[success_ratio(equipment_success_valid, '500', 0.5, 5),
 success_ratio(equipment_success_valid, '1000', 0.5, 5),
 success_ratio(equipment_success_valid, '2000', 0.5, 5),
 success_ratio(equipment_success_valid, '4000', 0.5, 5)]
def model_good_prediction_ratio_array(targets, predictions, max_distance):
    # Fraction of predictions whose absolute error is below max_distance.
    predictionsDiff = list(map(lambda x: abs(x), list(targets - predictions)))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff))) / len(predictionsDiff)
def model_good_prediction_ratio(targets, predictions, max_distance):
    # Same ratio as above, but for pandas Series inputs.
    predictionsDiff = (targets - predictions).apply(lambda x: abs(x))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff))) / len(predictionsDiff)
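A toy check of the ratio helper (made-up values):

t = np.array([50.0, 60.0, 70.0])
p = np.array([52.0, 68.0, 71.0])
# Absolute errors are [2, 8, 1]; two of the three are within 5 dB, so the ratio is 2/3.
print(model_good_prediction_ratio_array(t, p, 5))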
equipment_success = equipment_success_valid.copy()
# [(equipment_success_valid['isMale'] == 0)
#  & (equipment_success_valid['decades_old'] == 7)
# ].copy()
equipment_success
def safe_append(arr, x):
    if arr is None:
        return [x]
    arr.append(x)
    return arr

def safe_inc(n):
    if n is None:
        return 1
    return n + 1
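Note that safe_append and safe_inc reimplement what collections.defaultdict already provides; an equivalent sketch:

from collections import defaultdict

counts = defaultdict(int)    # replaces the safe_inc bookkeeping
groups = defaultdict(list)   # replaces the safe_append bookkeeping
counts['some_center'] += 1
groups['some_center'].append(42)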
center_ids = equipment_success['center_id'].values
tests_per_center = {}
for i in range(len(center_ids)):
    center_id = center_ids[i]
    tests_per_center[center_id] = safe_inc(tests_per_center.get(center_id))
1. Keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input, advanced_activations
from keras import optimizers
from keras import regularizers
keras.__version__
equipment_success_valid = scale(equipment_success_valid,
                                ['freq_250', 'freq_500', 'freq_750', 'freq_1000', 'freq_1500',
                                 'freq_2000', 'freq_3000', 'freq_4000', 'freq_6000', 'freq_8000',
                                 'years', 'decades_old', 'age_in_months_at_equipped_start'])
equipment_success_all = equipment_success_valid.copy()
# [(equipment_success_valid['isFemale'] == 1)
#  & (equipment_success_valid['decades_old'] == 8)
# ].copy()
equipment_success = equipment_success_all.sample(frac=1)
equipment_success
features = [
    "years_scaled",
    "isMale",
    "isFemale",
    # "isLeftEar",
    # "isRightEar",
    # 'freq_250_scaled',
    'freq_500_scaled',
    # 'freq_750_scaled',
    'freq_1000_scaled',
    # 'freq_1500_scaled',
    'freq_2000_scaled',
    # 'freq_3000_scaled',
    'freq_4000_scaled',
    # 'freq_6000_scaled',
    'freq_8000_scaled',
    # 'freq_500_over_freq_1000',
    # 'freq_2000_over_freq_1000',
    # 'freq_4000_over_freq_1000'
]
data = equipment_success[features].values
labels = equipment_success['eq_freq_1000'].values

print("original good prediction ratio: ", model_good_prediction_ratio_array(labels, np.average(labels), 5))
print("original rmse: ", np.std(labels - np.average(labels)))
2. Data exploration
original_data = data.copy() original_labels = labels.copy()
unscaled_features = [
    "age_in_months_at_equipped_start",
    "isMale",
    "isFemale",
    'freq_250', 'freq_500', 'freq_750', 'freq_1000', 'freq_1500',
    'freq_2000', 'freq_3000', 'freq_4000', 'freq_6000', 'freq_8000',
]
ff = equipment_success[unscaled_features].values
labels = equipment_success['eq_freq_1000'].values
[ff.shape, labels.shape]
len(ff)
# O(n^2) scan for near-duplicate inputs with contradictory labels: same gender,
# ages within 120 months, audiograms within 10 dB (L2 norm), but equipped
# outcomes at least 5 dB apart.
bad = {}
for i in range(len(ff)):
    if i % 100 == 0:
        print(i)
        print(bad)
    for j in range(i + 1, len(ff)):
        # if ff[i][1] == ff[j][1] and ff[i][2] == ff[j][2] and abs(ff[i][0] - ff[j][0]) < 120 and abs(ff[i][6] - ff[j][6]) <= 5 and abs(labels[i] - labels[j]) > 5:
        # if abs(ff[i][6] - ff[j][6]) <= 5 and abs(labels[i] - labels[j]) > 5:
        if (ff[i][1] == ff[j][1] and ff[i][2] == ff[j][2]
                and abs(ff[i][0] - ff[j][0]) < 120
                and np.linalg.norm(ff[i][3:] - ff[j][3:]) < 10
                and abs(labels[i] - labels[j]) >= 5):
            if bad.get(i) is None:
                bad[i] = []
            bad[i].append(j)
bad
[ff[0], labels[0]]
[ff[26], labels[26]]
list(bad.keys())
good_data = np.delete(data, list(bad.keys()), axis=0)
good_labels = np.delete(labels, list(bad.keys()), axis=0)
[good_data.shape, good_labels.shape]
3. SVM
from sklearn import svm
my_features = [
    'center_id',
    'years',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    'freq_1000',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    'freq_8000'
]
my_equipment_success = equipment_success.copy()
my_equipment_success['valid'] = (
    my_equipment_success['eq_freq_1000'].apply(lambda x : 20 <= x <= 60)
)
# my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
data = my_equipment_success[my_features].values
labels = my_equipment_success['eq_freq_1000'].values
[data.shape, labels.shape]
n = 10000
clf = svm.SVR(gamma='scale', kernel='rbf', C=4, epsilon=4, shrinking=True)
clf.fit(data[0:n], labels[0:n])
predictions = clf.predict(data)
print("training")
print(np.std(labels[0:n] - np.mean(labels)))
print(np.std(labels[0:n] - predictions[0:n]))
print(np.std(labels[0:n] - predictions[0:n]) ** 2)
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
# Note: slicing from n+1 skips sample n entirely; [n:] would use every held-out row.
print(np.std(labels[n+1:] - np.mean(labels)))
print(np.std(labels[n+1:] - predictions[n+1:]))
print(np.std(labels[n+1:] - predictions[n+1:]) ** 2)
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], 5))
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVR
X_train, X_test, y_train, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=0)

tuned_parameters = [{'kernel': ['rbf'],
                     'gamma': ['scale'],
                     'C': [4, 6, 10],
                     'epsilon': [4, 6, 10],
                     'shrinking': [True]}]
print("Generating the models") clf = GridSearchCV(SVR(), tuned_parameters, cv=5, scoring = "neg_mean_squared_error") clf.fit(X_train, y_train) print("Best parameters set found on development set:") print() print(clf.best_params_) print() print("Grid scores on development set:") print() means = clf.cv_results_['mean_test_score'] stds = clf.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, clf.cv_results_['params']): print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params)) print()
clf.get_params()
4. Gradient boosting
import lightgbm as lgb
def model_good_prediction_ratio_diffs_array(diffs, max_distance):
    # Same ratio as above, but starting from precomputed differences.
    predictionsDiff = list(map(lambda x: abs(x), list(diffs)))
    return len(list(filter(lambda x: x < max_distance, predictionsDiff))) / len(predictionsDiff)
my_equipment_success = equipment_success.copy()
# my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
my_features = [
    'center_id',
    'freq_1000',
    'decades_old',
    # 'years',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    # 'freq_8000'
]
data = my_equipment_success[my_features].values
labels = my_equipment_success['eq_freq_1000'].values
losses = my_equipment_success['freq_1000'].values
labels_ratio = my_equipment_success['eq_freq_1000'].values / losses
[data.shape, labels.shape]
center_ids = my_equipment_success['center_id'].values

tests_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    tests_per_center[center_id] = safe_inc(tests_per_center.get(center_id))

labels_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    labels_per_center[center_id] = safe_append(labels_per_center.get(center_id), labels[i])
mean_labels_per_center = {k: np.mean(v) for k, v in labels_per_center.items()}

labels_ratio_per_center = {}
for i in range(len(data)):
    center_id = center_ids[i]
    labels_ratio_per_center[center_id] = safe_append(labels_ratio_per_center.get(center_id), labels[i] / data[i][1])
mean_labels_ratio_per_center = {k: np.mean(v) for k, v in labels_ratio_per_center.items()}
labels_ratio_scaled_per_center = []
for i in range(len(data)):
    center_id = center_ids[i]
    labels_ratio_scaled_per_center.append(labels_ratio[i] / mean_labels_ratio_per_center[center_id])
# Keep only rows from centers with enough tests to estimate per-center statistics.
min_test_per_center = 200
good_indexes = []
for i in range(len(data)):
    center_id = center_ids[i]
    if tests_per_center[center_id] > min_test_per_center:
        good_indexes.append(i)

data = np.asarray([data[index] for index in good_indexes])
labels = np.asarray([labels[index] for index in good_indexes])
losses = np.asarray([losses[index] for index in good_indexes])
labels_ratio = np.asarray([labels_ratio[index] for index in good_indexes])
labels_ratio_scaled_per_center = np.asarray([labels_ratio_scaled_per_center[index] for index in good_indexes])
center_ids = np.asarray([center_ids[index] for index in good_indexes])

n = 4000
[data.shape, labels_ratio.shape]
{k: [v, tests_per_center.get(k)]
 for k, v in mean_labels_ratio_per_center.items()
 if tests_per_center.get(k) >= min_test_per_center}
train_data = lgb.Dataset(data[0:n], label=labels[0:n],
                         feature_name=my_features,
                         categorical_feature=['genderNum', 'center_id'])
param = {'num_leaves': 37,
         # 'max_depth': 20,
         # 'min_gain_to_split': 1000,
         'num_trees': 85,
         # 'min_data_in_leaf': 100,
         # 'max_bin': 100,
         'objective': 'regression'}
num_round = 30
bst = lgb.train(param, train_data, num_round)

predictions = bst.predict(data)  # *losses
# for i in range(len(data)):
#     center_id = center_ids[i]
#     predictions[i] = predictions[i]*mean_labels_ratio_per_center[center_id]

delta = 5
print("training")
print(np.std(labels[0:n] - np.mean(labels)))
print(np.std(labels[0:n] - predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], delta))
print("validation")
print(np.std(labels[n+1:] - np.mean(labels)))
print(np.std(labels[n+1:] - predictions[n+1:]))
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], delta))
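To see which inputs the boosted trees actually rely on, the LightGBM booster exposes per-feature split counts; a quick sketch:

# Split-count importance per feature (higher = the feature is used in more splits).
for name, importance in zip(my_features, bst.feature_importance()):
    print(name, importance)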
center_ids = my_equipment_success['center_id'].values
diff_per_center = {}
low_baseline_diff_per_center = {}
baseline_diff_per_center = {}
for i in range(0, n):  # , len(data)):
    center_id = data[i][0]
    diff = (predictions - labels)[i]
    baseline_diff = labels[i] - mean_labels_per_center[center_id]
    low_baseline_diff = labels[i] - np.mean(labels)
    diff_per_center[center_id] = safe_append(diff_per_center.get(center_id), diff)
    baseline_diff_per_center[center_id] = safe_append(baseline_diff_per_center.get(center_id), baseline_diff)
    low_baseline_diff_per_center[center_id] = safe_append(low_baseline_diff_per_center.get(center_id), low_baseline_diff)
my_dictionary = {k: [np.std(v), model_good_prediction_ratio_diffs_array(v, delta), len(v)]
                 for k, v in diff_per_center.items()}
good_centers = {k: 1 for k, v in my_dictionary.items() if v[1] >= 0}
good_centers = {k: v for k, v in my_dictionary.items() if good_centers.get(k)}
[sum(list(map(lambda x: x[2], list(my_dictionary.values())))),
 sum(list(map(lambda x: x[2], list(good_centers.values()))))]
{k:v for k,v in mean_labels_ratio_per_center.items() if tests_per_center.get(k) >= 100}
print("validation on good centers") validation_labels = labels[n+1:] validation_predictions = predictions[n+1:] validation_data = data[n+1:] validataion_center_ids = center_ids[n+1:] validation_predictions_good = [] validation_labels_good = [] for i in range(len(validation_data)): center_id = validataion_center_ids[i] if good_centers.get(center_id): validation_labels_good.append(validation_labels[i]) validation_predictions_good.append(validation_predictions[i]) validation_labels_good = np.asarray(validation_labels_good) validation_predictions_good = np.asarray(validation_predictions_good) print(np.std(validation_labels_good - np.mean(labels))) print(np.std(validation_labels_good - validation_predictions_good)) print(model_good_prediction_ratio_array(validation_labels_good, np.mean(labels), delta)) print(model_good_prediction_ratio_array(validation_labels_good, validation_predictions_good, delta))
my_dictionary = {k: [np.std(v), model_good_prediction_ratio_diffs_array(v, delta), len(v)]
                 for k, v in baseline_diff_per_center.items()}
my_dictionary
my_dictionary = {k: [np.std(v), model_good_prediction_ratio_diffs_array(v, delta), len(v)]
                 for k, v in low_baseline_diff_per_center.items()}
my_dictionary
predictions = bst.predict(data) * (my_equipment_success['freq_1000'].values)
delta = 5
print("training")
print(np.std(labels[0:n] - np.mean(labels)))
print(np.std(labels[0:n] - predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], delta))
print("validation")
print(np.std(labels[n+1:] - np.mean(labels)))
print(np.std(labels[n+1:] - predictions[n+1:]))
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), delta))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], delta))
5. Decision Tree
from sklearn import tree
import graphviz
my_features = [
    'center_id',
    'decades_old',
    'genderNum',
    # 'freq_250',
    'freq_500',
    # 'freq_750',
    'freq_1000',
    # 'freq_1500',
    'freq_2000',
    # 'freq_3000',
    'freq_4000',
    # 'freq_6000',
    'freq_8000'
]
my_equipment_success = equipment_success.copy()
# my_equipment_success = my_equipment_success.query(' valid == True').copy()
my_equipment_success = my_equipment_success.sample(frac=1)
data = my_equipment_success[my_features].values
labels = my_equipment_success['eq_freq_1000'].values
[data.shape, labels.shape]
5.1. Regression
n = 10000
clf = tree.DecisionTreeRegressor(  # min_samples_leaf=2,
    max_depth=6
    # min_impurity_decrease=0.025
)
clf.fit(data[0:n], labels[0:n])
[clf.score(data[0:n], labels[0:n]), clf.score(data[n+1:], labels[n+1:])]
clf.feature_importances_
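The raw importance array is easier to read when paired with the feature names; a small sketch:

# Pair each importance with its feature name, most important first.
sorted(zip(my_features, clf.feature_importances_), key=lambda t: -t[1])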
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("idris")
predictions = clf.predict(data)
print("training")
print(np.std(labels[0:n] - np.mean(labels)))
print(np.std(labels[0:n] - predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
print(np.std(labels[n+1:] - np.mean(labels)))
print(np.std(labels[n+1:] - predictions[n+1:]))
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], 5))
from sklearn.ensemble import RandomForestRegressor
clf2 = RandomForestRegressor(n_estimators=40, max_features=7, max_depth=6)
# , min_impurity_decrease=0.025)  # , min_samples_leaf=10)
clf2.fit(data[0:n], labels[0:n])
[clf2.score(data[0:n], labels[0:n]), clf2.score(data[n+1:], labels[n+1:])]
clf2.get_params()
predictions = clf2.predict(data)
print("training")
print(np.std(labels[0:n] - np.mean(labels)))
print(np.std(labels[0:n] - predictions[0:n]))
print(model_good_prediction_ratio_array(labels[0:n], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[0:n], predictions[0:n], 5))
print("validation")
print(np.std(labels[n+1:] - np.mean(labels)))
print(np.std(labels[n+1:] - predictions[n+1:]))
print(model_good_prediction_ratio_array(labels[n+1:], np.mean(labels), 5))
print(model_good_prediction_ratio_array(labels[n+1:], predictions[n+1:], 5))
5.2. Classifier
def rounder(t):
    # Bucket a dB value into 5 dB-wide classes.
    return int(round(t / 5))

classes = np.array([rounder(xi) for xi in labels])
print(classes)
plt.hist(classes)
100 * len(list(filter(lambda x: 10 < x < 60, labels))) / len(data)
# Note: this rounds only the first *row* of data (data[0]), not a column.
data[0] = np.array([rounder(xi) for xi in data[0]])
data[0]
set(classes)
from sklearn.ensemble import RandomForestClassifier
clf2 = RandomForestClassifier(n_estimators=10, max_features=3)  # , min_samples_leaf=10)
clf2.fit(data[0:4000], classes[0:4000])
[clf2.score(data[0:4000], classes[0:4000]), clf2.score(data[4000:], classes[4000:])]
# Keep only the rows where the classifier is at least 80% confident.
proba = np.asarray(list(map(lambda x: np.max(x), clf2.predict_proba(data))))
good_indices = np.ndarray.nonzero(proba > 0.8)[0]
good_indices.shape
training_good_indices = list(filter(lambda x: x < 4000, good_indices))
validation_good_indices = list(filter(lambda x: 5000 > x >= 4000, good_indices))
[len(training_good_indices), len(validation_good_indices)]
[clf2.score(data[training_good_indices], classes[training_good_indices]), clf2.score(data[validation_good_indices], classes[validation_good_indices])]
def classifier_good_prediction_ratio_array(targets, predictions):
    # Plain accuracy: the fraction of exact class matches.
    predictionsDiff = (targets == predictions)
    return len(list(filter(lambda x: x, predictionsDiff))) / len(predictionsDiff)
clf = tree.DecisionTreeClassifier(  # min_samples_leaf=None,
    min_impurity_decrease=0.0004)
clf.fit(data[0:4000], classes[0:4000])
[clf.score(data[0:4000], classes[0:4000]), clf.score(data[4000:], classes[4000:])]
dot_data = tree.export_graphviz(clf, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("idris")
my_features
clf.feature_importances_
proba = np.asarray(list(map(lambda x: np.max(x), clf.predict_proba(data))))
good_indices = np.ndarray.nonzero(proba > 0.8)[0]
good_indices.shape
training_good_indices = list(filter(lambda x: x < 4000, good_indices))
validation_good_indices = list(filter(lambda x: 5000 > x >= 4000, good_indices))
[len(training_good_indices), len(validation_good_indices)]
[clf.score(data[training_good_indices], classes[training_good_indices]), clf.score(data[validation_good_indices], classes[validation_good_indices])]
relevant_labels = classes[0:4000]
[classifier_good_prediction_ratio_array(relevant_labels, int(np.average(relevant_labels))),
 clf.score(data[0:4000], classes[0:4000])]
relevant_labels = classes[4000:]
[classifier_good_prediction_ratio_array(relevant_labels, int(np.average(relevant_labels))),
 clf.score(data[4000:], classes[4000:])]
i = validation_good_indices[1]
print(i)
print(clf.predict_proba(data[[i]]))
classes[i]
[labels[4042], data[4042]]
list(map(lambda x: [x, rounder(x)], range(1, 100)))
6. Neural Network
def good_prediction_error(y_true, y_pred):
    # 1.0 where the prediction is within 5 dB of the target, else 0.0.
    # (tf.abs added so that large negative errors do not count as good.)
    g = tf.abs(tf.subtract(y_true, y_pred))
    g = tf.cast(g < 5.0, tf.float32)
    return g

# good_prediction_error(np.array([1, 20]), np.array([0, 0]))
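As defined, good_prediction_error returns a per-sample 0/1 tensor. A scalar version that mirrors the numpy helpers (a sketch, not used in the original training runs) would average it:

def good_prediction_ratio(y_true, y_pred):
    # Fraction of predictions within 5 dB of the target, as a single scalar.
    g = tf.abs(tf.subtract(y_true, y_pred))
    return tf.reduce_mean(tf.cast(g < 5.0, tf.float32))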
features
eq = equipment_success.sample(frac=1)
# Pull features and labels from the shuffled copy so the two stay aligned.
data = eq[features].values
labels = eq['eq_freq_1000'].values
data = original_data.copy()
# Caution: shuffling data alone breaks its row alignment with labels.
np.random.shuffle(data)
data.shape
model = Sequential([
    Dense(300, input_dim=len(features)),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(100),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(100),
    advanced_activations.LeakyReLU(alpha=0.3),
    Dense(1),
])
model.compile(optimizer=optimizers.Adam(lr=0.01),
              loss='mean_squared_error',
              metrics=['mean_squared_error'])

validation_split = 0.2
training_samples = int(len(data) * (1 - validation_split))
history = model.fit(data, labels,
                    epochs=200,
                    steps_per_epoch=100,
                    validation_split=validation_split,
                    validation_steps=100,
                    verbose=1)
from operator import itemgetter

# Epoch with the lowest training MSE, and the corresponding RMSE.
a, b = min(enumerate(history.history['mean_squared_error']), key=itemgetter(1))
[a, math.sqrt(b)]
relevant_data = data[0:training_samples]
relevant_labels = labels[0:training_samples]
predictions = model.predict(relevant_data).transpose()[0]
[model_good_prediction_ratio_array(relevant_labels, predictions, 5),
 model_good_prediction_ratio_array(relevant_labels, np.average(relevant_labels), 5),
 np.std(relevant_labels - predictions),
 np.std(relevant_labels - np.average(relevant_labels))]
relevant_data = data[training_samples:]
relevant_labels = labels[training_samples:]
predictions = model.predict(relevant_data).transpose()[0]
[model_good_prediction_ratio_array(relevant_labels, predictions, 5),
 model_good_prediction_ratio_array(relevant_labels, np.average(relevant_labels), 5),
 np.std(relevant_labels - predictions),
 np.std(relevant_labels - np.average(relevant_labels))]
# Plot training & validation loss values plt.plot(history.history['mean_squared_error']) plt.plot(history.history['val_mean_squared_error']) plt.title('Model mean_squared_error') plt.ylabel('mean_squared_error') plt.ylim(0, 100) plt.xlabel('Epoch') plt.legend(['Train', 'Test'], loc='upper right') plt.show()
test_dataset = equipment_success_all.copy()
# [(equipment_success_valid['isMale'] == 0)
#  & (equipment_success_valid['decades_old'] == 7)
# ].copy()
test_dataset = test_dataset.sample(frac=1)
test_dataset = test_dataset.tail(1000)
test_data = test_dataset[features].values
# Note: the model was trained against eq_freq_1000; here it is scored against eq_freq_2000.
test_labels = test_dataset['eq_freq_2000'].values
test_predictions = model.predict(test_data).transpose()[0]
[model_good_prediction_ratio_array(test_labels, test_predictions, 5),
 model_good_prediction_ratio_array(test_labels, np.average(test_labels), 5)]
test_labels - test_predictions