55import datetime
66from sklearn .metrics import confusion_matrix
77from sklearn .metrics import f1_score , precision_score , recall_score
8- import time
98import warnings
109warnings .filterwarnings ("ignore" )
1110
1716 ]
1817
1918params = {
20- 'max_depth' : 4 ,
21- 'eta' : 0.05 ,
19+ 'max_depth' : 3 ,
20+ 'eta' : 0.03 ,
2221 'objective' : 'binary:logistic' ,
2322 'eval_metric' : 'auc' ,
2423 }
2524
26- def train (train_features ,train_labels ,num_round = 400 ):
25+ def train (train_features ,train_labels ,num_round = 900 ):
2726 dtrain = xgb .DMatrix (train_features , label = train_labels )
2827 bst = xgb .train (params , dtrain , num_round )
2928 # get best_threshold
@@ -90,19 +89,15 @@ def get_feature_importances(bst):
9089 importance = sorted (importance , key = lambda x : x [0 ][1 ], reverse = True )
9190 return importance
9291
93- if __name__ == '__main__' :
94- model_save_pth = "model/" + datetime .datetime .now ().strftime ("%Y-%m-%d-%H-%M-%S" )
95- if not os .path .exists (model_save_pth ):
96- os .makedirs (model_save_pth )
92+ def train_loop (num_round = 900 ):
9793 precision_list = []
9894 recall_list = []
9995 f1_list = []
10096 c_matrix_list = []
10197 feature_importance_list = []
10298 for i in range (len (os .listdir ("Input" ))):
103- time .sleep (1 )
10499 train_features , train_labels , test_features , test_labels = preprocess ("Input/" + str (i ))
105- bst , best_threshold = train (train_features , train_labels )
100+ bst , best_threshold = train (train_features , train_labels , num_round )
106101 precision , recall , f1 , c_matrix = test (bst ,best_threshold , test_features , test_labels )
107102 feature_importance = get_feature_importances (bst )
108103 #print(f"Positive rate in Training: {sum(train_labels)/len(train_labels)*100:.2f}%")
@@ -116,12 +111,8 @@ def get_feature_importances(bst):
116111 bst .save_model (model_save_pth + f"/{ i } .model" )
117112 with open (model_save_pth + f"/{ i } .threshold" ,'w' ) as f :
118113 f .write (str (best_threshold ))
119- # give evaluation results
120- print ("Average Precision: %.2f" % np .mean (precision_list ))
121- print ("Average Recall: %.2f" % np .mean (recall_list ))
122- print ("Average F1: %.2f" % np .mean (f1_list ))
123- print (f1_list )
124- print (np .mean (c_matrix_list ,axis = 0 ))
114+ #print(f1_list)
115+ #print(np.mean(c_matrix_list,axis=0))
125116 # evaluate feature importance
126117 feature_name_importance = {}
127118 for feature_importance in feature_importance_list :
@@ -131,6 +122,48 @@ def get_feature_importances(bst):
131122 else :
132123 feature_name_importance [feature_name ] = im [1 ]
133124 feature_name_importance = sorted (feature_name_importance .items (), key = lambda x : x [1 ], reverse = True )
134- print ('feature importance:' )
135- for item in feature_name_importance :
136- print (item )
125+ return precision_list , recall_list , f1_list , c_matrix_list , feature_name_importance
126+
def optimize_hyperparameter(eta_candid, max_depth_candid, num_round_candid):
    """Grid-search over eta / max_depth / num_round and return the best combo.

    For every (eta, max_depth, num_round) triple the shared module-level
    ``params`` dict is updated and ``train_loop`` is run over all Input/ splits;
    the combination with the highest mean F1 wins.

    Args:
        eta_candid: iterable of learning-rate candidates.
        max_depth_candid: iterable of tree-depth candidates.
        num_round_candid: iterable of boosting-round candidates.

    Returns:
        (best_params, best_precision, best_recall, best_f1) where best_params
        is a snapshot of ``params`` (plus the winning ``num_round``), or
        ``None`` if no combination produced a mean F1 above 0.
    """
    best_f1 = 0
    # Initialize so a grid where every F1 is 0 cannot raise UnboundLocalError.
    best_params = None
    best_precision = 0
    best_recall = 0
    for eta in eta_candid:
        for max_depth in max_depth_candid:
            for num_round in num_round_candid:
                print(eta, max_depth, num_round)
                # train()/train_loop() read the module-level params dict.
                params["eta"] = eta
                params["max_depth"] = max_depth
                precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop(num_round)
                mean_f1 = np.mean(f1_list)
                if mean_f1 > best_f1:
                    best_f1 = mean_f1
                    # Copy the dict: storing `params` itself would alias the
                    # mutable global, so later iterations would silently
                    # overwrite the recorded "best" values (original bug).
                    # Also record the winning num_round, which the original
                    # dropped entirely.
                    best_params = dict(params, num_round=num_round)
                    best_precision = np.mean(precision_list)
                    best_recall = np.mean(recall_list)
    return best_params, best_precision, best_recall, best_f1
142+
143+
if __name__ == '__main__':
    # Timestamped output directory; train_loop() saves per-split models and
    # thresholds here via the module-level name.
    model_save_pth = "model/" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    # exist_ok=True avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pair.
    os.makedirs(model_save_pth, exist_ok=True)

    # Flip to True to grid-search hyperparameters before the final run
    # (named flag replaces the original bare `if False:` dead-code guard;
    # behavior is unchanged).
    tune_hyperparameters = False
    if tune_hyperparameters:
        eta_candidate = [0.3, 0.2, 0.15, 0.1, 0.08, 0.05, 0.03]
        max_depth_candidate = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
        num_round_candidate = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
        best_params, best_precision, best_recall, best_f1 = optimize_hyperparameter(
            eta_candidate, max_depth_candidate, num_round_candidate)
        print(best_params)
        print(best_precision)
        print(best_recall)
        print(best_f1)

    # Train one model per Input/<i> split and report metrics averaged
    # across splits.
    precision_list, recall_list, f1_list, c_matrix_list, feature_name_importance = train_loop()
    print("Average Precision: %.3f" % np.mean(precision_list))
    print("Average Recall: %.3f" % np.mean(recall_list))
    print("Average F1: %.3f" % np.mean(f1_list))
    print(f1_list)
    print("Average Confusion Matrix: \n ", np.mean(c_matrix_list, axis=0))
    print("Feature Importance:")
    for importance in feature_name_importance:
        print(f"{importance[0]} : {importance[1]}")
0 commit comments