百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例(Blackhole 1.0.0)
文档简介:
Blackhole是百度自研的高性能数据科学引擎,CodeLab中内嵌了该引擎。通过异构加速计算、超大数据处理、高效数据存储等技术,单机Blackhole在数据分析和机器学习等场景相比开源Pandas/Sklearn性能可提升7倍以上、拥有TB级的单机超大数据处理能力,同时提供和Pandas、Sklearn基本一致的易用接口。
Blackhole 1.0.0
Blackhole是百度自研的高性能数据科学引擎,CodeLab中内嵌了该引擎。通过异构加速计算、超大数据处理、高效数据存储等技术,单机Blackhole在数据分析和机器学习等场景相比开源Pandas/Sklearn性能可提升7倍以上、拥有TB级的单机超大数据处理能力,同时提供和Pandas、Sklearn基本一致的易用接口。参考文档点击这里查看。
本文使用Blackhole中的随机森林算法对希格斯玻色子的信号进行预测,并采用准确率作为评估指标对模型性能进行评估。背景参考Kaggle竞赛“希格斯玻色子机器学习挑战”(https://www.kaggle.com/c/higgs-boson/overview);HIGGS数据集由加利福尼亚大学机器学习与智能系统中心提供,用于预测希格斯玻色子的信号。
训练数据集下载地址:https://codelab-dataset.cdn.bcebos.com/small/competition/higgs.zip
单机训练(计算节点数为1),示例代码如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Blackhole train demo
"""
Copyright (c) 2021 Baidu.com, Inc. All Rights Reserved

File: demo.py

Blackhole demo

Background: this demo follows the Kaggle competition "Higgs Boson Machine
Learning Challenge". The HIGGS dataset is provided by the Center for Machine
Learning and Intelligent Systems at the University of California and is used
to predict Higgs boson signals.

The demo uses the random forest algorithm in blackhole to predict Higgs boson
signals and evaluates the model with the accuracy metric.

In the dataset, column 1 is the label column and the following 28 columns are
feature columns.

Dataset:
    https://archive.ics.uci.edu/ml/datasets/HIGGS
    https://codelab-dataset.cdn.bcebos.com/small/competition/higgs.zip
Competition:
    https://www.kaggle.com/c/higgs-boson/overview
"""
import os
import logging
import shutil

import blackhole
import blackhole.gibbons as pd
from blackhole.ml.metrics import accuracy_score
from blackhole.ml.model_selection import train_test_split
from blackhole.ml.ensemble import RandomForestClassifier

logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s',
                    level=logging.INFO)


def _select_features(data):
    """
    Return every column of ``data`` except 'label'.

    Shared by training and prediction so both feed the model the same
    feature columns, selected the same way.
    NOTE(review): in pandas, ``Index.difference`` returns the labels sorted;
    presumably blackhole.gibbons behaves the same -- confirm.

    :param data: dataframe whose columns were named by load_data
    :return: dataframe restricted to the feature columns
    """
    return data[data.columns.difference(['label'])]


def load_data(csv_file):
    """
    Read a HIGGS csv file into a dataframe with named columns.

    The first column is named 'label' and the following 28 columns are named
    'col-2' ... 'col-29'.

    :param csv_file: path of the csv file to read
    :return: dataframe with 29 named columns
    :raises AssertionError: if csv_file does not exist
    """
    assert os.path.exists(csv_file), "%s not exists" % csv_file
    logging.info("Load data from %s", csv_file)
    # Assign column names
    col_names = ['label'] + ["col-{}".format(i) for i in range(2, 30)]
    data = pd.read_csv(csv_file, names=col_names)
    return data


def split_data(data, test_ratio=0.3):
    """
    Split a labelled dataframe into train and test parts.

    :param data: dataframe containing a 'label' column plus feature columns
    :param test_ratio: test data ratio
    :return: dict with keys "X_train", "X_test", "y_train", "y_test"
    """
    logging.info("Split_data, train ratio: %s, split_ratio: %s",
                 1.0 - test_ratio, test_ratio)
    # Separate data into X and y
    X, y = _select_features(data), data['label']
    # test_size is passed as an absolute number of rows, not as a fraction
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=int(len(data) * test_ratio))
    logging.info("After split, train_data: %s, test_data: %s",
                 len(X_train), len(X_test))
    train_test_data_dict = {
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    return train_test_data_dict


def create_model(model, params):
    """
    Instantiate a model class with the given keyword parameters.

    :param model: model class (or any callable) to instantiate
    :param params: dict of keyword arguments passed to ``model``
    :return: the object returned by ``model(**params)``
    """
    logging.info("Create model from %s, params: %s", model, params)
    bh_model = model(**params)
    return bh_model


def fit(model_instance, X_train, y_train, X_test, y_test):
    """
    Fit the model on the training split and log accuracy on the test split.

    :param model_instance: model object exposing fit() and predict()
    :param X_train: training features
    :param y_train: training labels
    :param X_test: held-out features used for evaluation
    :param y_test: held-out labels used for evaluation
    :return: the fitted model_instance
    """
    logging.info("Fit model...")
    model_instance.fit(X_train, y_train)
    pre = model_instance.predict(X_test)
    # Conventional argument order is (y_true, y_pred)
    accuracy = accuracy_score(y_test, pre)
    # The score is computed on the held-out split, so report it as test accuracy
    logging.info("Test accuracy: %s", accuracy)
    return model_instance


def save_model(model_instance, output_path):
    """
    Save the model to output_path, replacing anything already there.

    :param model_instance: fitted model to save
    :param output_path: directory the model is written to
    :return: output_path
    """
    # Remove a previous output directory; a missing directory is not an error
    shutil.rmtree(output_path, ignore_errors=True)
    blackhole.ml.save_model(model_instance, output_path)
    logging.info("Save model to %s", output_path)
    return output_path


def predict(model_path, X_test):
    """
    Load a saved model and predict on the given features.

    :param model_path: path the model was saved to
    :param X_test: dataframe, should not contain label
    :return: prediction result returned by the model
    :raises AssertionError: if model_path does not exist
    """
    assert os.path.exists(model_path), "%s not exists" % model_path
    logging.info("Load model from %s, predict ...", model_path)
    model = blackhole.ml.load_model(model_path)
    pred = model.predict(X_test)
    logging.info("Predict_result number: %s, show top 5: \n%s",
                 len(pred), pred.head())
    return pred


def main():
    """
    Run the demo: load and split data, train and save a model, then predict.

    :return: None

    File directory example:
        |-- demo.py
        |-- output
        |   |-- bhml.meta
        |   `-- bhml.model
        |-- test_data
        |   `-- HIGGS.csv
        `-- train_data
            `-- HIGGS.csv
    """
    # step 1, load and split data
    train_csv_file = "./train_data/HIGGS.csv"  # csv file is in train_data folder
    train_data = load_data(train_csv_file)
    train_test_data_dict = split_data(train_data)
    X_train = train_test_data_dict['X_train']
    X_test = train_test_data_dict['X_test']
    y_train = train_test_data_dict['y_train']
    y_test = train_test_data_dict['y_test']

    # step 2, create model, fit and save
    model_params = {
        'n_estimators': 25,
        'max_depth': 13,
    }
    bh_RandomForestClassifier = create_model(RandomForestClassifier, model_params)
    bh_RandomForestClassifier = fit(bh_RandomForestClassifier, X_train, y_train, X_test, y_test)
    saved_model_path = "./output/"  # output path is in ./output
    save_model(bh_RandomForestClassifier, saved_model_path)

    # step 3, predict
    test_csv_file = "./test_data/HIGGS.csv"  # csv file is in test_data folder
    test_data = load_data(test_csv_file)
    # load_data names the first column 'label'; drop it so the model receives
    # the same feature columns it was trained on
    predict(saved_model_path, _select_features(test_data))


if __name__ == "__main__":
    main()