# -*- coding: utf-8 -*-
"""Fit a decision tree on a train CSV and append predictions to train/test CSVs.

Usage:
    python <this_script> TRAIN_CSV TEST_CSV CLASS_NAME [RANDOM_STATE]

Outputs (written to the current working directory):
    step2_train_dataset_with_predictions.csv
    step2_test_dataset_with_predictions.csv
    step2_model.pickle

NOTE(review): in the original source the value assigned to ``class_name`` and
the left operand of ``... or _params["random_state"]`` were missing (they look
like stripped template placeholders), which made the script unparsable. They
are taken from the command line here; confirm against whatever generates or
launches this script.
"""
# Author:
# Antonio Lopez-Martinez-Carrasco

import pickle
import sys

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Hyper-parameters forwarded to DecisionTreeClassifier.
_params = {
    "criterion": 'gini',
    "splitter": 'best',
    "max_depth": None,
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "min_weight_fraction_leaf": 0.0,
    "max_features": None,
    "random_state": None,
    "max_leaf_nodes": None,
    "min_impurity_decrease": 0.0,
    "class_weight": None,
    "ccp_alpha": 0.0,
    "monotonic_cst": None,
}

_USAGE = "usage: {prog} TRAIN_CSV TEST_CSV CLASS_NAME [RANDOM_STATE]"


def _resolve_random_state(argv):
    """Return the seed from the optional 4th argument, else the configured one.

    An explicit presence check is used instead of ``value or default`` so that
    a seed of 0 is honoured rather than silently replaced by the default.

    Raises:
        SystemExit: if the argument is present but is not an integer.
    """
    if len(argv) > 4 and argv[4] != "":
        try:
            return int(argv[4])
        except ValueError:
            raise SystemExit(
                "RANDOM_STATE must be an integer, got %r" % argv[4]
            ) from None
    return _params["random_state"]


def _build_model(random_state):
    """Create the classifier from ``_params`` with the given random state."""
    return DecisionTreeClassifier(
        criterion=_params["criterion"],
        splitter=_params["splitter"],
        max_depth=_params["max_depth"],
        min_samples_split=_params["min_samples_split"],
        min_samples_leaf=_params["min_samples_leaf"],
        min_weight_fraction_leaf=_params["min_weight_fraction_leaf"],
        max_features=_params["max_features"],
        random_state=random_state,
        max_leaf_nodes=_params["max_leaf_nodes"],
        min_impurity_decrease=_params["min_impurity_decrease"],
        class_weight=_params["class_weight"],
        ccp_alpha=_params["ccp_alpha"],
        monotonic_cst=_params["monotonic_cst"],
    )


def _with_predictions(model, dataset, class_name, pred_column):
    """Return a copy of ``dataset`` with the model's predictions in ``pred_column``.

    The class column is dropped before predicting when it is present, so the
    same helper serves datasets with and without the class column.
    """
    features = dataset.drop(columns=[class_name], errors="ignore")
    result = dataset.copy()
    # Assign the raw prediction array: it is matched to rows by position,
    # whereas a pd.Series would be aligned on index labels.
    result[pred_column] = model.predict(features)
    return result


def main(argv):
    """Train the model, write both datasets with predictions and pickle the model.

    Args:
        argv: command-line vector; ``argv[1]`` is the train CSV path,
            ``argv[2]`` the test CSV path, ``argv[3]`` the class column name
            and the optional ``argv[4]`` an integer random state.

    Raises:
        SystemExit: if fewer than three arguments are supplied.
        KeyError: if the class column is not present in the train dataset.
    """
    if len(argv) < 4:
        raise SystemExit(_USAGE.format(prog=argv[0] if argv else "script"))

    train_path, test_path, class_name = argv[1], argv[2], argv[3]
    att_name_for_predictions = class_name + "_pred"

    # Read the input datasets.
    train_dataset = pd.read_csv(train_path)
    test_dataset = pd.read_csv(test_path)

    # Create and fit the model on the train data split into X and y.
    model = _build_model(_resolve_random_state(argv))
    y = train_dataset[class_name]
    X = train_dataset.drop(columns=[class_name])
    model.fit(X, y)

    # Predict on both datasets and add the new attribute.
    train_dataset_with_predictions = _with_predictions(
        model, train_dataset, class_name, att_name_for_predictions
    )
    test_dataset_with_predictions = _with_predictions(
        model, test_dataset, class_name, att_name_for_predictions
    )

    # Write the results to disk.
    train_dataset_with_predictions.to_csv(
        "step2_train_dataset_with_predictions.csv", index=False
    )
    test_dataset_with_predictions.to_csv(
        "step2_test_dataset_with_predictions.csv", index=False
    )
    # The context manager closes the file even if pickling fails.
    with open("step2_model.pickle", "wb") as model_file:
        pickle.dump(model, model_file)


if __name__ == "__main__":
    main(sys.argv)