diff --git a/configs/common/dbscan.json b/configs/common/dbscan.json new file mode 100644 index 00000000..c40714b5 --- /dev/null +++ b/configs/common/dbscan.json @@ -0,0 +1,27 @@ +{ + "PARAMETERS_SETS": { + "common dbscan parameters": { + "algorithm": { + "estimator": "DBSCAN", + "estimator_params": { + "eps": "[SPECIAL_VALUE]distances_quantile:0.01", + "min_samples": 5, + "metric": "euclidean" + } + } + }, + "sklearn dbscan parameters": { + "algorithm": { + "estimator_params": { + "algorithm": "brute", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "cuml dbscan parameters": { + "algorithm": { + "estimator_params": { "calc_core_sample_indices": false, "verbose": 2 } + } + } + } +} diff --git a/configs/common/ensemble.json b/configs/common/ensemble.json new file mode 100644 index 00000000..17d928d9 --- /dev/null +++ b/configs/common/ensemble.json @@ -0,0 +1,64 @@ +{ + "PARAMETERS_SETS": { + "common ensemble params": { + "algorithm": { + "estimator_params": { + "n_estimators": 500, + "max_depth": 12, + "max_samples": 0.8, + "min_samples_split": 5, + "min_samples_leaf": 2, + "min_impurity_decrease": 0.0, + "bootstrap": true, + "random_state": 42 + } + } + }, + "sklearn ensemble classifier params": { + "algorithm": { + "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], + "estimator_params": { + "criterion": "gini", + "max_features": "sqrt", + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "sklearn ensemble regressor params": { + "algorithm": { + "estimator": ["RandomForestRegressor", "ExtraTreesRegressor"], + "estimator_params": { + "criterion": "squared_error", + "max_features": 1.0, + "max_leaf_nodes": null, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } + } + }, + "cuml ensemble classifier params": { + "algorithm": { + "estimator": "RandomForestClassifier", + "estimator_params": { + "n_streams": 4, + "split_criterion": "gini", + "max_features": "sqrt", + "max_leaves": -1, + "n_bins": 256 + } + } + }, + "cuml ensemble regressor params": { + "algorithm": { + "estimator": "RandomForestRegressor", + "estimator_params": { + "n_streams": 4, + "split_criterion": "mse", + "max_features": 1.0, + "max_leaves": -1, + "n_bins": 256 + } + } + } + } +} diff --git a/configs/common/kmeans.json b/configs/common/kmeans.json new file mode 100644 index 00000000..c6581d9e --- /dev/null +++ b/configs/common/kmeans.json @@ -0,0 +1,25 @@ +{ + "PARAMETERS_SETS": { + "common kmeans parameters": { + "algorithm": { + "estimator": "KMeans", + "estimator_params": { + "n_clusters": "[SPECIAL_VALUE]auto", + "n_init": 1, + "max_iter": 30, + "tol": 1e-3, + "random_state": 42 + }, + "estimator_methods": { "inference": "predict" } + } + }, + "sklearn kmeans parameters": { + "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } } + }, + "cuml kmeans parameters": { + "algorithm": { + "estimator_params": { "init": "scalable-k-means++" } + } + } + } +} diff --git a/configs/common/knn.json b/configs/common/knn.json new file mode 100644 index 00000000..056fb229 --- /dev/null +++ b/configs/common/knn.json @@ -0,0 +1,42 @@ +{ + "PARAMETERS_SETS": { + "common knn parameters": { + "algorithm": { + "estimator_params": { + "n_neighbors": [10, 100], + "weights": "uniform" + } + }, + "data": { + "preprocessing_kwargs": { "normalize": true } + } + }, + "sklearn knn parameters": { + "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } + }, + "brute knn classification parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + } + }, + "kd_tree knn classification parameters": { + "algorithm": { + "estimator": "KNeighborsClassifier", + "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } + } + }, + "brute knn regression parameters": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } + } + }, + "kd_tree knn regression parameters": { + "algorithm": { + "estimator": "KNeighborsRegressor", + "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } + } + } + } +} diff --git a/configs/common/lightgbm.json b/configs/common/lightgbm.json new file mode 100644 index 00000000..54242c95 --- /dev/null +++ b/configs/common/lightgbm.json @@ -0,0 +1,45 @@ +{ + "PARAMETERS_SETS": { + "lightgbm implementations": [ + { + "algorithm": { + "device": "cpu", + "estimator_params": { + "boosting_type": "gbdt", + "verbosity": -1, + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + }, + "enable_modelbuilders": false + } + } + ], + "lightgbm binary classification": { + "algorithm": { + "library": "lightgbm", + "estimator": "LGBMClassifier", + "estimator_params": { + "objective": "binary" + } + } + }, + "lightgbm multi classification": { + "algorithm": { + "library": "lightgbm", + "estimator": "LGBMClassifier", + "estimator_params": { + "objective": "multiclass", + "num_classes": "[SPECIAL_VALUE]auto" + } + } + }, + "lightgbm regression": { + "algorithm": { + "library": "lightgbm", + "estimator": "LGBMRegressor", + "estimator_params": { + "objective": "regression" + } + } + } + } +} diff --git a/configs/common/linear_model.json b/configs/common/linear_model.json new file mode 100644 index 00000000..345418fd --- /dev/null +++ b/configs/common/linear_model.json @@ -0,0 +1,56 @@ +{ + "PARAMETERS_SETS": { + "common linear parameters": { + "algorithm": { + "estimator": "LinearRegression", + "estimator_params": { "fit_intercept": true, "copy_X": true } + } + }, + "common ridge parameters": { + "algorithm": { + "estimator": "Ridge", + "estimator_params": { + "fit_intercept": true, + "alpha": 2.0 + } + } + }, + "common lasso parameters": { + "algorithm": { + "estimator": "Lasso", + "estimator_params": { + "fit_intercept": true, + "max_iter": 1000, + "selection": "cyclic", + "alpha": 1e-3, + "tol": 1e-6 + } + } + }, + "common elasticnet parameters": { + "algorithm": { + "estimator": "ElasticNet", + "estimator_params": { + "fit_intercept": true, + "max_iter": 1000, + "selection": "cyclic", + "alpha": 1e-3, + "l1_ratio": 0.9, + "tol": 1e-6 + } + } + }, + "sklearn linear parameters": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + }, + "sklearn ridge parameters": { + "estimator_params": { "solver": "auto", "tol": 1e-4 } + }, + "cuml L2 parameters": { + "estimator_params": { "solver": "eig" } + }, + "cuml L1 parameters": { + "estimator_params": { "solver": "cd" } + } + } +} diff --git a/configs/common/logreg.json b/configs/common/logreg.json new file mode 100644 index 00000000..20e15aa4 --- /dev/null +++ b/configs/common/logreg.json @@ -0,0 +1,29 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "common logreg parameters": { + "algorithm": { + "estimator": "LogisticRegression", + "estimator_methods": { "inference": "predict" }, + "estimator_params": { + "penalty": "l2", + "tol": 1e-4, + "C": 1.0, + "l1_ratio": null, + "max_iter": 200 + } + } + }, + "sklearn logreg parameters": { + "algorithm": { + "estimator_params": { + "solver": "lbfgs", + "random_state": 42 + } + } + }, + "cuml logreg parameters": { + "algorithm": { "estimator_params": { "solver": "qn" } } + } + } +} diff --git a/configs/common/pca.json b/configs/common/pca.json new file mode 100644 index 00000000..32df40ac --- /dev/null +++ b/configs/common/pca.json @@ -0,0 +1,27 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "pca parameters": { + "algorithm": { + "estimator": "PCA", + "estimator_params": { + "n_components": 3, + "copy": true, + "whiten": false, + "svd_solver": "covariance_eigh", + "tol": 0.0, + "iterated_power": 15, + "random_state": 42 + } + } + }, + "cuml pca parameters": { + "algorithm": { + "estimator_params": { + "svd_solver": "full", + "random_state": "[REMOVE]" + } + } + } + } +} diff --git a/configs/common/sklearn.json b/configs/common/sklearn.json index d7b13188..60018e6f 100644 --- a/configs/common/sklearn.json +++ b/configs/common/sklearn.json @@ -12,13 +12,6 @@ { "library": "sklearnex", "device": ["cpu", "gpu"] } ] }, - "sklearn-ex[preview] implementations": { - "algorithm": [ - { "library": "sklearn", "device": "cpu" }, - { "library": "sklearnex", "device": "cpu" }, - { "library": "sklearnex.preview", "device": ["cpu", "gpu"] } - ] - }, "sklearnex spmd implementation": { "algorithm": { "library": "sklearnex.spmd", @@ -50,7 +43,19 @@ }, "cuml implementation": { "algorithm": { "library": "cuml" }, - "data": { "format": "cudf" } + "data": { "format": "cupy" } + }, + "extended data formats": { + "data": [ + { + "format": "numpy", + "order": "C" + }, + { + "format": "pandas", + "order": "F" + } + ] } } } diff --git a/configs/common/svm.json b/configs/common/svm.json new file mode 100644 index 00000000..ca1e8f2d --- /dev/null +++ b/configs/common/svm.json @@ -0,0 +1,86 @@ +{ + "PARAMETERS_SETS": { + "binary svc implementations": [ + { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": ["cpu", "gpu"] } + ] + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"}, + "estimator_params": { "verbose": false, "multiclass_strategy": "ovr" } + }, + "data": { "format": "cupy" } + } + ], + "multi svc implementations": [ + { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu", + "estimator_params": { "decision_function_shape": "ovr" } + } + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"}, + "estimator_params": { "multiclass_strategy": "ovr" } + }, + "data": { "format": "cupy" } + } + ], + "svr implementations": [ + { + "algorithm": { + "library": ["sklearn", "sklearnex"], + "device": "cpu" + } + }, + { + "algorithm": { + "library": "cuml", + "estimator_methods": {"inference": "predict"} + }, + "data": { "format": "cupy" } + } + ], + "nusvm implementations": { + "algorithm": [ + { "library": "sklearn", "device": "cpu" }, + { "library": "sklearnex", "device": "cpu" } + ] + }, + "common svm parameters": { + "algorithm": { + "estimator_params": { + "kernel": "rbf", + "degree": 3, + "gamma": "scale", + "tol": 1e-3, + "cache_size": 24576, + "max_iter": 10000 + } + }, + "data": { "preprocessing_kwargs": { "normalize": true } } + }, + "svm clsf parameters": { + "algorithm": { "estimator_params": { "random_state": 42 } } + }, + "svc parameters": { + "algorithm": { "estimator": "SVC", "estimator_params": { "C": 1.0 } } + }, + "svr parameters": { + "algorithm": { "estimator": "SVR", "estimator_params": { "C": 1.0 } } + }, + "nusvc parameters": { + "algorithm": { "estimator": "NuSVC", "estimator_params": { "nu": 0.5 } } + }, + "nusvr parameters": { + "algorithm": { "estimator": "NuSVR", "estimator_params": { "nu": 0.5, "C": 1.0 } } + } + } +} diff --git a/configs/common/train_test_split.json b/configs/common/train_test_split.json new file mode 100644 index 00000000..5b35917a --- /dev/null +++ b/configs/common/train_test_split.json @@ -0,0 +1,17 @@ +{ + "INCLUDE": ["../common/sklearn.json"], + "PARAMETERS_SETS": { + "train_test_split parameters": { + "algorithm": { + "function": "train_test_split", + "args_order": "x_train|y_train", + "kwargs": { + "test_size": 0.25, + "random_state": 42, + "shuffle": true + } + }, + "data": { "split_kwargs": { "ignore": true } } + } + } +} diff --git a/configs/common/tsne.json b/configs/common/tsne.json new file mode 100644 index 00000000..897c7e4e --- /dev/null +++ b/configs/common/tsne.json @@ -0,0 +1,33 @@ +{ + "PARAMETERS_SETS": { + "sklearn parameters": { + "algorithm": { + "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } + } + }, + "cuml parameters": { + "algorithm": { + "estimator_params": { "learning_rate_method": "none", "n_neighbors": 91 } + } + }, + "common tsne parameters": { + "algorithm": { + "estimator": "TSNE", + "estimator_params": { + "n_components": 2, + "perplexity": 30.0, + "early_exaggeration": 12.0, + "learning_rate": 200.0, + "n_iter": 1000, + "n_iter_without_progress": 300, + "min_grad_norm": 1e-7, + "metric": "euclidean", + "init": "random", + "random_state": 42, + "method": "barnes_hut", + "angle": 0.5 + } + } + } + } +} diff --git a/configs/common/xgboost.json b/configs/common/xgboost.json index 1eced184..66f1a708 100644 --- a/configs/common/xgboost.json +++ b/configs/common/xgboost.json @@ -4,14 +4,20 @@ { "algorithm": { "device": "cpu", - "estimator_params": { "tree_method": "hist" }, + "estimator_params": { + "tree_method": "hist", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + }, "enable_modelbuilders": false } }, { "algorithm": { "device": "gpu", - "estimator_params": { "tree_method": "hist" } + "estimator_params": { + "tree_method": "hist", + "n_jobs": "[SPECIAL_VALUE]physical_cpus" + } }, "data": { "format": "cudf" } } diff --git a/configs/regular/dbscan.json b/configs/regular/dbscan.json index 71dcdc9b..1e684212 100644 --- a/configs/regular/dbscan.json +++ b/configs/regular/dbscan.json @@ -1,59 +1,29 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/dbscan.json"], "PARAMETERS_SETS": { - "common dbscan parameters": { - "algorithm": { - "estimator": "DBSCAN", - "estimator_params": { - "eps": "[SPECIAL_VALUE]distances_quantile:0.01", - "min_samples": 5, - "metric": "euclidean" - } - }, - "data": { - "dtype": ["float32", "float64"] - } - }, - "sklearn dbscan parameters": { - "algorithm": { - "estimator_params": { - "algorithm": "brute", - "n_jobs": "[SPECIAL_VALUE]physical_cpus" - } - } - }, - "cuml dbscan parameters": { - "algorithm": { - "estimator_params": { "calc_core_sample_indices": false, "verbose": 2 } - } - }, - "dbscan datasets": [ - { - "data": { "dataset": ["cifar", "mnist"], "split_kwargs": { "train_size": 10000 } } - }, - { - "data": { "dataset": ["sensit", "hepmass"], "split_kwargs": { "train_size": 20000 } } - }, - { - "data": { - "dataset": "road_network", - "preprocessing_kwargs": { "normalize": true }, - "split_kwargs": { "train_size": [20000, 50000] } - } - }, - { - "data": { + "dbscan datasets": { + "data": [ + { "dataset": "cifar", "split_kwargs": { "train_size": 15000 } }, + { "dataset": "mnist", "split_kwargs": { "train_size": 40000 } }, + { "dataset": "sensit", "split_kwargs": { "ignore": true } }, + { "dataset": "susy", "split_kwargs": { "train_size": 100000 } }, + { + "dataset": "skin_segmentation", + "split_kwargs": { "train_size": 100000 }, + "preprocessing_kwargs": { "normalize": true } + }, + { "source": "make_blobs", "generation_kwargs": { - "centers": 20, + "centers": 10, "n_samples": 50000, - "n_features": [4, 16, 64, 256], - "cluster_std": 1.5 + "n_features": [8, 64, 512], + "cluster_std": 2.0 }, "split_kwargs": { "ignore": true } } - } - ] + ] + } }, "TEMPLATES": { "sklearn dbscan": { diff --git a/configs/regular/ensemble.json b/configs/regular/ensemble.json index 56e37e77..28ef9f7d 100644 --- a/configs/regular/ensemble.json +++ b/configs/regular/ensemble.json @@ -1,89 +1,24 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/ensemble.json"], "PARAMETERS_SETS": { - "common ensemble params": { - "algorithm": { - "estimator_params": { - "n_estimators": 200, - "max_depth": 16, - "max_samples": 1.0, - "min_samples_split": 5, - "min_samples_leaf": 2, - "min_impurity_decrease": 0.0, - "bootstrap": true, - "random_state": 42 - } - } - }, - "sklearn ensemble classifier params": { - "algorithm": { - "estimator": ["RandomForestClassifier", "ExtraTreesClassifier"], - "estimator_params": { - "criterion": "gini", - "max_features": "sqrt", - "max_leaf_nodes": null, - "n_jobs": "[SPECIAL_VALUE]physical_cpus" - } - } - }, - "sklearn ensemble regressor params": { - "algorithm": { - "estimator": ["RandomForestRegressor", "ExtraTreesRegressor"], - "estimator_params": { - "criterion": "squared_error", - "max_features": 1.0, - "max_leaf_nodes": null, - "n_jobs": "[SPECIAL_VALUE]physical_cpus" - } - } - }, - "cuml ensemble classifier params": { - "algorithm": { - "estimator": "RandomForestClassifier", - "estimator_params": { - "n_streams": 4, - "split_criterion": "gini", - "max_features": "sqrt", - "max_leaves": -1, - "n_bins": 256 - } - } - }, - "cuml ensemble regressor params": { - "algorithm": { - "estimator": "RandomForestRegressor", - "estimator_params": { - "n_streams": 4, - "split_criterion": "mse", - "max_features": 1.0, - "max_leaves": -1, - "n_bins": 256 - } - } - }, - "ensemble classification data": { + "ensemble classification datasets": { "data": [ - { "dataset": "skin_segmentation", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, - { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, - { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, - { "dataset": "mnist", "split_kwargs": { "train_size": 20000, "test_size": null } }, - { "dataset": "gisette", "split_kwargs": { "train_size": 5000, "test_size": 2000 } }, - { "dataset": "svhn", "split_kwargs": { "train_size": 10000, "test_size": 10000 } } + { "dataset": "codrnanorm", "split_kwargs": { "train_size": 0.25, "test_size": null } }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 0.25, "test_size": null } }, + { "dataset": "connect", "split_kwargs": { "train_size": 0.5, "test_size": null } }, + { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } }, + { "dataset": "svhn", "split_kwargs": { "train_size": 20000, "test_size": 50000 } }, + { "dataset": "gisette", "split_kwargs": { "train_size": 2000, "test_size": null } } ] }, - "ensemble regression data": { + "ensemble regression datasets": { "data": [ - { - "dataset": "road_network", - "split_kwargs": { - "train_size": 200000, "test_size": null, - "shuffle": true, "random_state": 42 - } - }, - { "dataset": "creditcard", "split_kwargs": { "train_size": 100000, "test_size": null } }, - { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 50000, "test_size": null } }, - { "dataset": "a9a", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } }, - { "dataset": "gisette", "split_kwargs": { "train_size": 5000, "test_size": 2000 } } + { "dataset": "skin_segmentation", "split_kwargs": { "train_size": 0.5, "test_size": null } }, + { "dataset": "codrnanorm", "split_kwargs": { "train_size": 0.25, "test_size": null } }, + { "dataset": "medical_charges_nominal", "split_kwargs": { "train_size": 0.25, "test_size": null } }, + { "dataset": "creditcard", "split_kwargs": { "train_size": 0.5, "test_size": null } }, + { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 0.1, "test_size": null } }, + { "dataset": "gisette", "split_kwargs": { "train_size": 1000, "test_size": null } } ] } }, @@ -93,7 +28,7 @@ "sklearn-ex[cpu,gpu] implementations", "common ensemble params", "sklearn ensemble classifier params", - "ensemble classification data" + "ensemble classification datasets" ] }, "sklearn ensemble regression": { @@ -101,7 +36,7 @@ "sklearn-ex[cpu,gpu] implementations", "common ensemble params", "sklearn ensemble regressor params", - "ensemble regression data" + "ensemble regression datasets" ] }, "cuml ensemble classification": { @@ -109,7 +44,7 @@ "cuml implementation", "common ensemble params", "cuml ensemble classifier params", - "ensemble classification data" + "ensemble classification datasets" ] }, "cuml ensemble regression": { @@ -117,7 +52,7 @@ "cuml implementation", "common ensemble params", "cuml ensemble regressor params", - "ensemble regression data" + "ensemble regression datasets" ] } } diff --git a/configs/regular/kmeans.json b/configs/regular/kmeans.json index bcb7026f..4d9497ca 100644 --- a/configs/regular/kmeans.json +++ b/configs/regular/kmeans.json @@ -1,70 +1,51 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/kmeans.json"], "PARAMETERS_SETS": { - "common kmeans parameters": { - "algorithm": { - "estimator": "KMeans", - "estimator_params": { - "n_clusters": "[SPECIAL_VALUE]auto", - "n_init": 1, - "max_iter": 30, - "tol": 1e-3, - "random_state": 42 - }, - "estimator_methods": { "inference": "predict" } - }, - "data": { - "dtype": ["float32", "float64"], - "preprocessing_kwargs": { "normalize": true } - } - }, - "sklearn kmeans parameters": { - "algorithm": { "estimator_params": { "init": "k-means++", "algorithm": "lloyd" } } - }, - "cuml kmeans parameters": { - "algorithm": { - "estimator_params": { "init": "scalable-k-means++" } - } - }, "kmeans datasets": [ { "data": [ { - "dataset": ["covtype", "sensit"], - "split_kwargs": { "ignore": true } + "dataset": "covtype", + "split_kwargs": { "ignore": true }, + "preprocessing_kwargs": { "normalize": true } }, { "dataset": ["mnist", "gisette"], - "split_kwargs": { "ignore": true }, - "preprocessing_kwargs": { "normalize": false } + "split_kwargs": { "ignore": true } + }, + { + "dataset": "cifar", + "split_kwargs": { "train_size": 10000, "test_size": null } } ] }, { "data": { - "dataset": "higgs", + "dataset": "hepmass", "split_kwargs": { - "train_size": 100000, - "test_size": 2000000, + "train_size": 2000000, + "test_size": null, "shuffle": true, "random_state": 42 - } - }, - "algorithm": [ - { - "estimator_params": { - "n_clusters": 100, - "max_iter": 10 - } }, - { - "estimator_params": { - "n_clusters": 10, - "max_iter": 100 - } - } - ] + "preprocessing_kwargs": { "normalize": true } + }, + "algorithm": { + "estimator_params": { "n_clusters": [2, 50] } + } + }, + { + "data": { + "source": "make_blobs", + "generation_kwargs": [ + { "centers": 20, "cluster_std": 4.0, "n_samples": 20000000, "n_features": 10 }, + { "centers": 20, "cluster_std": 8.0, "n_samples": 2000000, "n_features": 100 }, + { "centers": 20, "cluster_std": 24.0, "n_samples": 500000, "n_features": 400 } + ], + "split_kwargs": { "train_size": 0.2, "test_size": null } + } } + ] }, "TEMPLATES": { diff --git a/configs/regular/knn.json b/configs/regular/knn.json index e1cd8a75..c4c971ec 100644 --- a/configs/regular/knn.json +++ b/configs/regular/knn.json @@ -1,72 +1,59 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/knn.json"], "PARAMETERS_SETS": { - "common knn parameters": { - "algorithm": { - "estimator_params": { - "n_neighbors": [10, 100], - "weights": "uniform" - } - }, - "data": { - "preprocessing_kwargs": { "normalize": true } - } - }, - "sklearn knn parameters": { - "algorithm": { "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } } - }, - "brute knn algorithm - classification data": { - "algorithm": { - "estimator": "KNeighborsClassifier", - "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } - }, + "brute knn classification datasets": { "data": [ - { "dataset": "susy", "split_kwargs": { "train_size": 100000, "test_size": 10000 } }, + { "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } }, { "dataset": "connect" }, { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } } ] }, - "kd_tree knn algorithm - classification data": { - "algorithm": { - "estimator": "KNeighborsClassifier", - "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } - }, - "data": { - "source": "make_classification", - "generation_kwargs": { - "n_classes": 5, - "n_samples": [50000, 250000], - "n_features": [8, 16], - "n_informative": "[SPECIAL_VALUE]0.5" + "kd_tree knn classification datasets": { + "data": [ + { + "dataset": ["skin_segmentation", "codrnanorm"], + "split_kwargs": { "train_size": 0.25, "test_size": 0.75 } }, - "split_kwargs": { "train_size": 0.8, "test_size": 0.2 } - } + { + "source": "make_classification", + "generation_kwargs": [ + { + "n_classes": 5, "n_samples": 400000, "n_features": 4, + "n_redundant": 0, "n_repeated": 0, "n_informative": 4 + }, + { + "n_classes": 5, "n_samples": 200000, "n_features": 8, + "n_redundant": 2, "n_repeated": 2, "n_informative": 4 + }, + { + "n_classes": 5, "n_samples": 100000, "n_features": 16, + "n_redundant": 6, "n_repeated": 6, "n_informative": 4 + } + ], + "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } + } + ] }, - "brute knn algorithm - regression data": { - "algorithm": { - "estimator": "KNeighborsRegressor", - "estimator_params": { "algorithm": "brute", "metric": "minkowski", "p": [1, 2] } - }, + "brute knn regression datasets": { "data": [ { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 40000, "test_size": 10000 } }, { "dataset": ["fried", "twodplanes"] } ] }, - "kd_tree knn algorithm - regression data": { - "algorithm": { - "estimator": "KNeighborsRegressor", - "estimator_params": { "algorithm": "kd_tree", "metric": "minkowski", "p": 2 } - }, + "kd_tree knn regression datasets": { "data": [ - { "dataset": "fried" }, + { + "dataset": "medical_charges_nominal", + "split_kwargs": { "train_size": 0.75, "test_size": 0.25 } + }, { "source": "make_regression", - "generation_kwargs": { - "n_samples": [50000, 250000], - "n_features": [8, 16], - "noise": 0.75 - }, - "split_kwargs": { "train_size": 0.8, "test_size": 0.2 } + "generation_kwargs":[ + { "n_samples": 400000, "n_features": 4, "noise": 1.0 }, + { "n_samples": 200000, "n_features": 8, "noise": 1.5 }, + { "n_samples": 100000, "n_features": 16, "noise": 2.0 } + ], + "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } } ] } @@ -77,7 +64,8 @@ "sklearn-ex[cpu,gpu] implementations", "common knn parameters", "sklearn knn parameters", - "brute knn algorithm - classification data" + "brute knn classification parameters", + "brute knn classification datasets" ] }, "sklearn kd_tree knn clsf": { @@ -85,7 +73,8 @@ "sklearn-ex[cpu] implementations", "common knn parameters", "sklearn knn parameters", - "kd_tree knn algorithm - classification data" + "kd_tree knn classification parameters", + "kd_tree knn classification datasets" ] }, "sklearn brute knn regr": { @@ -93,7 +82,8 @@ "sklearn-ex[cpu,gpu] implementations", "common knn parameters", "sklearn knn parameters", - "brute knn algorithm - regression data" + "brute knn regression parameters", + "brute knn regression datasets" ] }, "sklearn kd_tree knn regr": { @@ -101,21 +91,24 @@ "sklearn-ex[cpu] implementations", "common knn parameters", "sklearn knn parameters", - "kd_tree knn algorithm - regression data" + "kd_tree knn regression parameters", + "kd_tree knn regression datasets" ] }, "cuml brute knn clsf": { "SETS": [ "cuml implementation", "common knn parameters", - "brute knn algorithm - classification data" + "brute knn classification parameters", + "brute knn classification datasets" ] }, "cuml brute knn regr": { "SETS": [ "cuml implementation", "common knn parameters", - "brute knn algorithm - regression data" + "brute knn regression parameters", + "brute knn regression datasets" ] } } diff --git a/configs/regular/lightgbm.json b/configs/regular/lightgbm.json new file mode 100644 index 00000000..237c1386 --- /dev/null +++ b/configs/regular/lightgbm.json @@ -0,0 +1,31 @@ +{ + "INCLUDE": [ + "../common/lightgbm.json", + "xgboost_binary.json", + "xgboost_multi.json", + "xgboost_regression.json" + ], + "TEMPLATES": { + "lightgbm binary classification": { + "SETS": [ + "lightgbm binary classification", + "lightgbm implementations", + "gbt binary classification data" + ] + }, + "lightgbm multi classification": { + "SETS": [ + "lightgbm multi classification", + "lightgbm implementations", + "gbt multi classification data" + ] + }, + "lightgbm regression": { + "SETS": [ + "lightgbm regression", + "lightgbm implementations", + "gbt regression data" + ] + } + } +} diff --git a/configs/regular/linear_model.json b/configs/regular/linear_model.json index eb1b79ba..154f9004 100644 --- a/configs/regular/linear_model.json +++ b/configs/regular/linear_model.json @@ -1,17 +1,25 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/linear_model.json"], "PARAMETERS_SETS": { "regression datasets": [ { "data": { "source": "make_regression", "split_kwargs": { "train_size": 0.2, "test_size": 0.8 }, - "generation_kwargs": { - "n_samples": 500000, - "n_features": [400, 2000], - "n_informative": 5, - "noise": 2.0 - } + "generation_kwargs": [ + { + "n_samples": 5000000, "n_features": 50, + "n_informative": 5, "noise": 20.0 + }, + { + "n_samples": 500000, "n_features": 400, + "n_informative": 5, "noise": 40.0 + }, + { + "n_samples": 100000, "n_features": 2000, + "n_informative": 5, "noise": 60.0 + } + ] } }, { @@ -24,63 +32,10 @@ { "data": { "dataset": ["hepmass", "susy"], - "split_kwargs": { "train_size": 1000000, "test_size": null } + "split_kwargs": { "train_size": 2000000, "test_size": null } } } - ], - "common linear parameters": { - "algorithm": { - "estimator": "LinearRegression", - "estimator_params": { "fit_intercept": true, "copy_X": true } - } - }, - "common ridge parameters": { - "algorithm": { - "estimator": "Ridge", - "estimator_params": { - "fit_intercept": true, - "alpha": 2.0, - "tol": 1e-4 - } - } - }, - "common lasso parameters": { - "algorithm": { - "estimator": "Lasso", - "estimator_params": { - "fit_intercept": true, - "max_iter": 1000, - "selection": "cyclic", - "alpha": 1e-3, - "tol": 1e-4 - } - } - }, - "common elasticnet parameters": { - "algorithm": { - "estimator": "ElasticNet", - "estimator_params": { - "fit_intercept": true, - "max_iter": 1000, - "selection": "cyclic", - "alpha": 1e-3, - "l1_ratio": 0.9, - "tol": 1e-4 - } - } - }, - "sklearn linear parameters": { - "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } - }, - "sklearn ridge parameters": { - "estimator_params": { "solver": "auto" } - }, - "cuml L2 parameters": { - "estimator_params": { "solver": "eig" } - }, - "cuml L1 parameters": { - "estimator_params": { "solver": "cd" } - } + ] }, "TEMPLATES": { "sklearn linear": { diff --git a/configs/regular/logreg.json b/configs/regular/logreg.json index a94a7fcf..d7bd2064 100644 --- a/configs/regular/logreg.json +++ b/configs/regular/logreg.json @@ -1,54 +1,44 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/logreg.json"], "PARAMETERS_SETS": { - "common logreg parameters": { - "algorithm": { - "estimator": "LogisticRegression", - "estimator_methods": { "inference": "predict" }, - "estimator_params": { - "penalty": "l2", - "tol": 1e-4, - "C": 1.0, - "l1_ratio": null, - "max_iter": 200 - } - } - }, - "sklearn logreg parameters": { - "algorithm": { - "estimator_params": { - "solver": "lbfgs", - "n_jobs": "[SPECIAL_VALUE]physical_cpus", - "random_state": 42 - } - } - }, - "cuml logreg parameters": { - "algorithm": { "estimator_params": { "solver": "qn" } } - }, "logreg datasets": [ { "data": { "source": "make_classification", - "generation_kwargs": { - "n_samples": 200000, - "n_features": [50, 500], - "n_classes": [2, 5], - "n_informative": "[SPECIAL_VALUE]0.5", - "class_sep": 0.75 - }, + "generation_kwargs": [ + { + "n_samples": 100000000, + "n_features": 5, + "n_classes": [2, 5], + "n_informative": 5, + "n_redundant": 0, + "class_sep": 1.0 + }, + { + "n_samples": 10000000, + "n_features": 50, + "n_classes": [2, 5], + "n_informative": "[SPECIAL_VALUE]0.6", + "class_sep": 1.0 + }, + { + "n_samples": 1000000, + "n_features": 500, + "n_classes": [2, 5], + "n_informative": "[SPECIAL_VALUE]0.6", + "class_sep": 1.0 + } + ], "split_kwargs": { - "train_size": 0.5, - "test_size": 0.5 + "train_size": 0.05, + "test_size": 0.95 } } }, - { "data": { "dataset": "mnist", "split_kwargs": { "train_size": 20000, "test_size": 50000 } } }, - { "data": { "dataset": "susy", "split_kwargs": { "train_size": 0.2, "test_size": 0.8 } } }, - { "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.5, "test_size": 0.5 } } }, - { "data": { "dataset": "klaverjas" } }, - { "data": { "dataset": "gisette" } }, - { "data": { "dataset": "skin_segmentation" } } + { "data": { "dataset": "mnist", "split_kwargs": { "train_size": 10000, "test_size": null } } }, + { "data": { "dataset": ["susy", "hepmass"], "split_kwargs": { "train_size": 0.1, "test_size": null } } }, + { "data": { "dataset": "cifar", "split_kwargs": { "train_size": 0.1, "test_size": null } } }, + { "data": { "dataset": "gisette", "split_kwargs": { "train_size": 2000, "test_size": null } } } ] }, "TEMPLATES": { diff --git a/configs/regular/pca.json b/configs/regular/pca.json index 582acc9e..98c5c706 100644 --- a/configs/regular/pca.json +++ b/configs/regular/pca.json @@ -1,44 +1,27 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/pca.json"], "PARAMETERS_SETS": { - "pca parameters": { - "algorithm": { - "estimator": "PCA", - "estimator_params": { - "n_components": 3, - "copy": true, - "whiten": false, - "svd_solver": "covariance_eigh", - "tol": 0.0, - "iterated_power": 15, - "random_state": 42 - } - } - }, "pca datasets": [ - { - "data": { - "dataset": ["cifar", "mnist", "yolanda"], - "split_kwargs": { "ignore": true } - } - }, { "data": { "source": "make_blobs", - "generation_kwargs": { "n_samples": 20000, "n_features": 2000, "centers": 2 }, - "split_kwargs": { "train_size": 0.1, "test_size": 0.9 } + "generation_kwargs": [ + { "n_samples": 40000000, "n_features": 5, "centers": 2 }, + { "n_samples": 10000000, "n_features": 20, "centers": 2 } + ], + "split_kwargs": { "ignore": true } } }, { "data": { - "dataset": "epsilon", - "split_kwargs": { "train_size": 50000, "test_size": null } + "dataset": "sift", + "split_kwargs": { "ignore": true } } }, { "data": { - "dataset": "higgs", - "split_kwargs": { "train_size": 1000000, "test_size": null } + "dataset": ["gist", "svhn"], + "split_kwargs": { "train_size": 0.25, "test_size": 0.75 } } } ] @@ -55,6 +38,7 @@ "SETS": [ "cuml implementation", "pca parameters", + "cuml pca parameters", "pca datasets" ] } diff --git a/configs/regular/svm.json b/configs/regular/svm.json index 177b3d56..f83e1be1 100644 --- a/configs/regular/svm.json +++ b/configs/regular/svm.json @@ -1,88 +1,7 @@ { + "INCLUDE": ["../common/sklearn.json", "../common/svm.json"], "PARAMETERS_SETS": { - "binary svc implementations": [ - { - "algorithm": [ - { "library": "sklearn", "device": "cpu" }, - { "library": "sklearnex", "device": ["cpu", "gpu"] } - ] - }, - { - "algorithm": { - "library": "cuml", - "estimator_methods": {"inference": "predict"}, - "estimator_params": { "verbose": false, "multiclass_strategy": "ovr" } - }, - "data": { "format": "cudf" } - } - ], - "multi svc implementations": [ - { - "algorithm": { - "library": ["sklearn", "sklearnex"], - "device": "cpu", - "estimator_params": { "decision_function_shape": "ovr" } - } - }, - { - "algorithm": { - "library": "cuml", - "estimator_methods": {"inference": "predict"}, - "estimator_params": { "multiclass_strategy": "ovr" } - }, - "data": { "format": "cudf" } - } - ], - "svr implementations": [ - { - "algorithm": { - "library": ["sklearn", "sklearnex"], - "device": "cpu" - } - }, - { - "algorithm": { - "library": "cuml", - "estimator_methods": {"inference": "predict"} - }, - "data": { "format": "cudf" } - } - ], - "nusvm implementations": { - "algorithm": [ - { "library": "sklearn", "device": "cpu" }, - { "library": "sklearnex", "device": "cpu" } - ] - }, - "common svm parameters": { - "algorithm": { - "estimator_params": { - "kernel": "rbf", - "degree": 3, - "gamma": "scale", - "tol": 1e-3, - "cache_size": 16384, - "max_iter": 10000 - } - }, - "data": { "preprocessing_kwargs": { "normalize": true } } - }, - "svm clsf parameters": { - "algorithm": { "estimator_params": { "random_state": 42 } } - }, - "svc parameters": { - "algorithm": { "estimator": "SVC", "estimator_params": { "C": 1.0 } } - }, - "svr parameters": { - "algorithm": { "estimator": "SVR", "estimator_params": { "C": 1.0 } } - }, - "nusvc parameters": { - "algorithm": { "estimator": "NuSVC", "estimator_params": { "nu": 0.5 } } - }, - "nusvr parameters": { - "algorithm": { "estimator": "NuSVR", "estimator_params": { "nu": 0.5, "C": 1.0 } } - }, - "svc binary data": [ + "svc binary datasets": [ { "data": { "dataset": "a9a", "split_kwargs": { "train_size": 5000, "test_size": null } }, "algorithm": { "estimator_params": { "C": 1.0, "kernel": "linear" } } @@ -106,7 +25,7 @@ } } ], - "svc multiclass data": [ + "svc multiclass datasets": [ { "data": { "dataset": "connect", "split_kwargs": { "train_size": 20000, "test_size": null } }, "algorithm": { "estimator_params": { "C": 10.0, "kernel": ["poly", "rbf"] } } @@ -120,7 +39,7 @@ "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } } ], - "svr data": [ + "svr datasets": [ { "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } @@ -150,7 +69,7 @@ "algorithm": { "estimator_params": { "C": 0.1, "kernel": "linear" } } } ], - "nusvc data": [ + "nusvc datasets": [ { "data": { "dataset": "a9a", "split_kwargs": { "train_size": 5000, "test_size": null } }, "algorithm": { "estimator_params": { "nu": 0.1, "kernel": ["poly", "rbf"] } } @@ -168,7 +87,7 @@ "algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } } } ], - "nusvr data": [ + "nusvr datasets": [ { "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 20000, "test_size": null } }, "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } @@ -206,7 +125,7 @@ "common svm parameters", "svm clsf parameters", "svc parameters", - "svc binary data" + "svc binary datasets" ] }, "svc multiclass": { @@ -215,7 +134,7 @@ "common svm parameters", "svm clsf parameters", "svc parameters", - "svc multiclass data" + "svc multiclass datasets" ] }, "svr": { @@ -223,7 +142,7 @@ "svr implementations", "common svm parameters", "svr parameters", - "svr data" + "svr datasets" ] }, "nusvc": { @@ -232,7 +151,7 @@ "common svm parameters", "svm clsf parameters", "nusvc parameters", - "nusvc data" + "nusvc datasets" ] }, "nusvr": { @@ -240,7 +159,7 @@ "nusvm implementations", "common svm parameters", "nusvr parameters", - "nusvr data" + "nusvr datasets" ] } } diff --git a/configs/regular/train_test_split.json b/configs/regular/train_test_split.json index 134d9e4e..607a8f26 100644 --- a/configs/regular/train_test_split.json +++ b/configs/regular/train_test_split.json @@ -1,33 +1,28 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/train_test_split.json"], "PARAMETERS_SETS": { - "train_test_split parameters": { - "algorithm": { - "function": "train_test_split", - "args_order": "x_train|y_train", - "kwargs": { - "test_size": 0.25, - "random_state": 42, - "shuffle": true - } - } - }, "train_test_split datasets": [ { "data": { - "dataset": "hepmass", - "split_kwargs": { - "train_size": [100000, 1000000, 10000000], - "test_size": null - } + "dataset": [ + "road_network", + "codrnanorm", + "susy", + "sift", + "gist", + "epsilon", + "svhn" + ] } }, { "data": { - "dataset": ["a9a", "mnist", "cifar", "gisette"], - "split_kwargs": [ - { "train_size": 0.4 }, - { "ignore": true } + "source": "make_regression", + "generation_kwargs": [ + { "n_samples": 20000000, "n_features": 5 }, + { "n_samples": 5000000, "n_features": 50 }, + { "n_samples": 1000000, "n_features": 500 }, + { "n_samples": 100000, "n_features": 5000 } ] } } diff --git a/configs/regular/tsne.json b/configs/regular/tsne.json index 135ebc16..0be7a890 100644 --- a/configs/regular/tsne.json +++ b/configs/regular/tsne.json @@ -1,35 +1,6 @@ { - "INCLUDE": ["../common/sklearn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/tsne.json"], "PARAMETERS_SETS": { - "sklearn parameters": { - "algorithm": { - "estimator_params": { "n_jobs": "[SPECIAL_VALUE]physical_cpus" } - } - }, - "cuml parameters": { - "algorithm": { - "estimator_params": { "learning_rate_method": "none", "n_neighbors": 91 } - } - }, - "common tsne parameters": { - "algorithm": { - "estimator": "TSNE", - "estimator_params": { - "n_components": 2, - "perplexity": 30.0, - "early_exaggeration": 12.0, - "learning_rate": 200.0, - "n_iter": 1000, - "n_iter_without_progress": 300, - "min_grad_norm": 1e-7, - "metric": "euclidean", - "init": "random", - "random_state": 42, - "method": "barnes_hut", - "angle": 0.5 - } - } - }, "tsne datasets": [ { "data": { diff --git a/configs/regular/xgboost_binary.json b/configs/regular/xgboost_binary.json index 170d41ef..0966b70e 100644 --- a/configs/regular/xgboost_binary.json +++ b/configs/regular/xgboost_binary.json @@ -1,7 +1,7 @@ { "INCLUDE": ["../common/xgboost.json"], "PARAMETERS_SETS": { - "binary classification data": [ + "gbt binary classification data": [ { "data": { "dataset": "airline_depdelay", @@ -103,7 +103,7 @@ "SETS": [ "xgboost binary classification", "xgboost implementations", - "binary classification data" + "gbt binary classification data" ] } } diff --git a/configs/regular/xgboost_multi.json b/configs/regular/xgboost_multi.json index 4552e05d..d56e9220 100644 --- a/configs/regular/xgboost_multi.json +++ b/configs/regular/xgboost_multi.json @@ -1,7 +1,7 @@ { "INCLUDE": ["../common/xgboost.json"], "PARAMETERS_SETS": { - "multiclassification data": [ + "gbt multi classification data": [ { "data": { "dataset": "letters", @@ -66,7 +66,7 @@ "SETS": [ "xgboost multiclassification", "xgboost implementations", - "multiclassification data" + "gbt multi classification data" ] } } diff --git a/configs/regular/xgboost_regression.json b/configs/regular/xgboost_regression.json index adffeebd..5ad5f02f 100644 --- a/configs/regular/xgboost_regression.json +++ b/configs/regular/xgboost_regression.json @@ -1,7 +1,7 @@ { "INCLUDE": ["../common/xgboost.json"], "PARAMETERS_SETS": { - "regression data": [ + "gbt regression data": [ { "data": { "dataset": "twodplanes", @@ -97,7 +97,7 @@ "SETS": [ "xgboost regression", "xgboost implementations", - "regression data" + "gbt regression data" ] } } diff --git a/configs/spmd/dbscan.json b/configs/spmd/dbscan.json index b3a039c8..d1872fd7 100644 --- a/configs/spmd/dbscan.json +++ b/configs/spmd/dbscan.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/dbscan.json"], + "INCLUDE": ["../common/sklearn.json", "../common/dbscan.json", "../regular/dbscan.json"], "PARAMETERS_SETS": { "spmd dbscan parameters": {} }, diff --git a/configs/spmd/ensemble.json b/configs/spmd/ensemble.json index da8e7036..c36e67c6 100644 --- a/configs/spmd/ensemble.json +++ b/configs/spmd/ensemble.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/ensemble.json"], + "INCLUDE": ["../common/sklearn.json", "../common/ensemble.json", "../regular/ensemble.json"], "PARAMETERS_SETS": { "spmd ensemble classifier params": { "algorithm": { diff --git a/configs/spmd/kmeans.json b/configs/spmd/kmeans.json index f9e8bb75..6b3423d4 100644 --- a/configs/spmd/kmeans.json +++ b/configs/spmd/kmeans.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/kmeans.json"], + "INCLUDE": ["../common/sklearn.json", "../common/kmeans.json", "../regular/kmeans.json"], "PARAMETERS_SETS": { "spmd kmeans parameters": {} }, diff --git a/configs/spmd/knn.json b/configs/spmd/knn.json index f64d26b5..554cf7f6 100644 --- a/configs/spmd/knn.json +++ b/configs/spmd/knn.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/knn.json"], + "INCLUDE": ["../common/sklearn.json", "../common/knn.json", "../regular/knn.json"], "PARAMETERS_SETS": { "spmd knn parameters": { "algorithm": { diff --git a/configs/spmd/linear_model.json b/configs/spmd/linear_model.json index 03058374..c00944c7 100644 --- a/configs/spmd/linear_model.json +++ b/configs/spmd/linear_model.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/linear_model.json"], + "INCLUDE": ["../common/sklearn.json", "../common/linear_model.json", "../regular/linear_model.json"], "PARAMETERS_SETS": { "spmd linear parameters": {} }, diff --git a/configs/spmd/logreg.json b/configs/spmd/logreg.json index 1c825ffa..b9d0e4bb 100644 --- a/configs/spmd/logreg.json +++ b/configs/spmd/logreg.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/logreg.json"], + "INCLUDE": ["../common/sklearn.json", "../common/logreg.json", "../regular/logreg.json"], "PARAMETERS_SETS": { "spmd logreg parameters": { "algorithm": { diff --git a/configs/spmd/pca.json b/configs/spmd/pca.json index aa3cb15c..7419a350 100644 --- a/configs/spmd/pca.json +++ b/configs/spmd/pca.json @@ -1,5 +1,5 @@ { - "INCLUDE": ["../common/sklearn.json", "../regular/pca.json"], + "INCLUDE": ["../common/sklearn.json", "../common/pca.json", "../regular/pca.json"], "PARAMETERS_SETS": { "spmd pca parameters": { "algorithm": { diff --git a/configs/weekly/dbscan.json b/configs/weekly/dbscan.json new file mode 100644 index 00000000..19406055 --- /dev/null +++ b/configs/weekly/dbscan.json @@ -0,0 +1,50 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/dbscan.json"], + "PARAMETERS_SETS": { + "high-load dbscan datasets": { + "data": [ + { + "dataset": ["cifar", "road_network", "covtype"], + "split_kwargs": { "ignore": true }, + "preprocessing_kwargs": { "normalize": true } + }, + { + "dataset": "susy", + "split_kwargs": { "train_size": 800000 }, + "preprocessing_kwargs": { "normalize": true } + }, + { + "source": "make_blobs", + "generation_kwargs": [ + { "centers": 20, "n_samples": 1000000, "n_features": 16, "cluster_std": 2.0 }, + { "centers": 60, "n_samples": 200000, "n_features": 1024, "cluster_std": 10.0 } + ], + "split_kwargs": { "ignore": true } + }, + { + "source": "make_moons", + "generation_kwargs": { "n_samples": 1000000, "noise": 0.05 }, + "split_kwargs": { "ignore": true } + } + ] + } + }, + "TEMPLATES": { + "sklearn dbscan": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common dbscan parameters", + "sklearn dbscan parameters", + "high-load dbscan datasets" + ] + }, + "cuml dbscan": { + "SETS": [ + "cuml implementation", + "common dbscan parameters", + "cuml dbscan parameters", + "high-load dbscan datasets" + ] + } + } +} diff --git a/configs/weekly/ensemble.json b/configs/weekly/ensemble.json new file mode 100644 index 00000000..69e6d1d9 --- /dev/null +++ b/configs/weekly/ensemble.json @@ -0,0 +1,65 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/ensemble.json"], + "PARAMETERS_SETS": { + "high-load ensemble params": { + "n_estimators": 2000, + "max_depth": null + }, + "high-load ensemble classification datasets": { + "data": [ + { "dataset": "codrnanorm", "split_kwargs": { "ignore": true } }, + { "dataset": "creditcard", "split_kwargs": { "ignore": true } }, + { "dataset": "connect", "split_kwargs": { "ignore": true } }, + { "dataset": "mnist", "split_kwargs": { "ignore": true } }, + { "dataset": "svhn", "split_kwargs": { "ignore": true } }, + { "dataset": "gisette", "split_kwargs": { "ignore": true } } + ] + }, + "high-load ensemble regression datasets": { + "data": [ + { "dataset": "skin_segmentation", "split_kwargs": { "ignore": true } }, + { "dataset": "codrnanorm", "split_kwargs": { "ignore": true } }, + { "dataset": "medical_charges_nominal", "split_kwargs": { "ignore": true } }, + { "dataset": "creditcard", "split_kwargs": { "ignore": true } }, + { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 0.5, "test_size": null } }, + { "dataset": "gisette", "split_kwargs": { "ignore": true } } + ] + } + }, + "TEMPLATES": { + "sklearn ensemble classification": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common ensemble params", + "sklearn ensemble classifier params", + "high-load ensemble classification datasets" + ] + }, + "sklearn ensemble regression": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common ensemble params", + "sklearn ensemble regressor params", + "high-load ensemble regression datasets" + ] + }, + "cuml ensemble classification": { + "SETS": [ + "cuml implementation", + "common ensemble params", + "cuml ensemble classifier params", + "high-load ensemble params", + "high-load ensemble classification datasets" + ] + }, + "cuml ensemble regression": { + "SETS": [ + "cuml implementation", + "common ensemble params", + "cuml ensemble regressor params", + "high-load ensemble params", + "high-load ensemble regression datasets" + ] + } + } +} diff --git a/configs/weekly/kmeans.json b/configs/weekly/kmeans.json new file mode 100644 index 00000000..c27af898 --- /dev/null +++ b/configs/weekly/kmeans.json @@ -0,0 +1,65 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/kmeans.json"], + "PARAMETERS_SETS": { + "high-load kmeans parameters": { + "algorithm": { + "estimator_params": { + "max_iter": 200, + "tol": 1e-6, + "n_clusters": [10, 50, 200] + } + } + }, + "high-load kmeans datasets": [ + { + "data": { + "dataset": ["susy", "hepmass"], + "split_kwargs": { "ignore": true }, + "preprocessing_kwargs": { "normalize": true } + } + }, + { + "data": { + "source": "make_blobs", + "generation_kwargs": [ + { "centers": 20, "cluster_std": 4.0, "n_samples": 20000000, "n_features": 10 }, + { "centers": 20, "cluster_std": 8.0, "n_samples": 2000000, "n_features": 100 }, + { "centers": 20, "cluster_std": 24.0, "n_samples": 500000, "n_features": 400 } + ], + "split_kwargs": { "ignore": true } + }, + "algorithm": { + "estimator_params": { + "n_clusters": 100 + } + } + }, + { + "data": { + "dataset": "cifar", + "split_kwargs": { "ignore": true } + } + } + ] + }, + "TEMPLATES": { + "sklearn kmeans": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common kmeans parameters", + "sklearn kmeans parameters", + "high-load kmeans parameters", + "high-load kmeans datasets" + ] + }, + "cuml kmeans": { + "SETS": [ + "cuml implementation", + "common kmeans parameters", + "cuml kmeans parameters", + "high-load kmeans parameters", + "high-load kmeans datasets" + ] + } + } +} diff --git a/configs/weekly/knn.json b/configs/weekly/knn.json new file mode 100644 index 00000000..dfc2864f --- /dev/null +++ b/configs/weekly/knn.json @@ -0,0 +1,104 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/knn.json"], + "PARAMETERS_SETS": { + "high-load brute knn classification datasets": { + "data": [ + { "dataset": "susy", "split_kwargs": { "train_size": 500000, "test_size": 500000 } } + ] + }, + "high-load kd_tree knn classification datasets": { + "data": [ + { + "source": "make_classification", + "generation_kwargs": [ + { + "n_classes": 5, "n_samples": 4000000, "n_features": 4, + "n_redundant": 0, "n_repeated": 0, "n_informative": 4 + }, + { + "n_classes": 5, "n_samples": 2000000, "n_features": 8, + "n_redundant": 2, "n_repeated": 2, "n_informative": 4 + }, + { + "n_classes": 5, "n_samples": 1000000, "n_features": 16, + "n_redundant": 6, "n_repeated": 6, "n_informative": 4 + } + ], + "split_kwargs": { "ignore": true } + } + ] + }, + "high-load brute knn regression datasets": { + "data": [ + { "dataset": "year_prediction_msd", "split_kwargs": { "ignore": true } } + ] + }, + "high-load kd_tree knn regression datasets": { + "data": [ + { + "source": "make_regression", + "generation_kwargs":[ + { "n_samples": 10000000, "n_features": 4, "noise": 1.0 }, + { "n_samples": 4000000, "n_features": 8, "noise": 1.5 }, + { "n_samples": 2000000, "n_features": 16, "noise": 2.0 } + ], + "split_kwargs": { "ignore": true } + } + ] + } + }, + "TEMPLATES": { + "sklearn brute knn clsf": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "brute knn classification parameters", + "high-load brute knn classification datasets" + ] + }, + "sklearn kd_tree knn clsf": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "kd_tree knn classification parameters", + "high-load kd_tree knn classification datasets" + ] + }, + "sklearn brute knn regr": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "brute knn regression parameters", + "high-load brute knn regression datasets" + ] + }, + "sklearn kd_tree knn regr": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common knn parameters", + "sklearn knn parameters", + "kd_tree knn regression parameters", + "high-load kd_tree knn regression datasets" + ] + }, + "cuml brute knn clsf": { + "SETS": [ + "cuml implementation", + "common knn parameters", + "brute knn classification parameters", + "high-load brute knn classification datasets" + ] + }, + "cuml brute knn regr": { + "SETS": [ + "cuml implementation", + "common knn parameters", + "brute knn regression parameters", + "high-load brute knn regression datasets" + ] + } + } +} diff --git a/configs/weekly/linear_model.json b/configs/weekly/linear_model.json new file mode 100644 index 00000000..9cb0bd58 --- /dev/null +++ b/configs/weekly/linear_model.json @@ -0,0 +1,105 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/linear_model.json"], + "PARAMETERS_SETS": { + "regression datasets": [ + { + "data": { + "source": "make_regression", + "split_kwargs": { "train_size": 0.5, "test_size": 0.5 }, + "generation_kwargs": [ + { + "n_samples": 20000000, "n_features": 50, + "n_informative": 5, "noise": 20.0 + }, + { + "n_samples": 2000000, "n_features": 400, + "n_informative": 5, "noise": 40.0 + }, + { + "n_samples": 200000, "n_features": 5000, + "n_informative": 5, "noise": 80.0 + } + ] + } + }, + { + "data": { + "dataset": [ + "epsilon", + "yolanda", + "hepmass", + "susy" + ], + "preprocessing_kwargs": { + "normalize": true + }, + "split_kwargs": { "ignore": true } + } + } + ] + }, + "TEMPLATES": { + "sklearn linear": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common linear parameters", + "sklearn linear parameters", + "regression datasets" + ] + }, + "sklearn ridge": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common ridge parameters", + "sklearn ridge parameters", + "regression datasets" + ] + }, + "sklearn lasso": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common lasso parameters", + "regression datasets" + ] + }, + "sklearn elasticnet": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "common elasticnet parameters", + "regression datasets" + ] + }, + "cuml linear": { + "SETS": [ + "cuml implementation", + "common linear parameters", + "cuml L2 parameters", + "regression datasets" + ] + }, + "cuml ridge": { + "SETS": [ + "cuml implementation", + "common ridge parameters", + "cuml L2 parameters", + "regression datasets" + ] + }, + "cuml lasso": { + "SETS": [ + "cuml implementation", + "common lasso parameters", + "cuml L1 parameters", + "regression datasets" + ] + }, + "cuml elasticnet": { + "SETS": [ + "cuml implementation", + "common elasticnet parameters", + "cuml L1 parameters", + "regression datasets" + ] + } + } +} diff --git a/configs/weekly/logreg.json b/configs/weekly/logreg.json new file mode 100644 index 00000000..d4576a4e --- /dev/null +++ b/configs/weekly/logreg.json @@ -0,0 +1,28 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/logreg.json", "../regular/logreg.json"], + "PARAMETERS_SETS": { + "high-load logreg datasets": [ + { "data": { "split_kwargs": { "ignore": true } } } + ] + }, + "TEMPLATES": { + "sklearn logreg": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common logreg parameters", + "sklearn logreg parameters", + "logreg datasets", + "high-load logreg datasets" + ] + }, + "cuml logreg": { + "SETS": [ + "cuml implementation", + "common logreg parameters", + "cuml logreg parameters", + "logreg datasets", + "high-load logreg datasets" + ] + } + } +} diff --git a/configs/weekly/pca.json b/configs/weekly/pca.json new file mode 100644 index 00000000..cef210d2 --- /dev/null +++ b/configs/weekly/pca.json @@ -0,0 +1,41 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/pca.json"], + "PARAMETERS_SETS": { + "high-load pca datasets": [ + { + "data": { + "source": "make_blobs", + "generation_kwargs": [ + { "n_samples": 10000000, "n_features": 200, "centers": 2 }, + { "n_samples": 5000000, "n_features": 1000, "centers": 2 }, + { "n_samples": 1000000, "n_features": 5000, "centers": 2 } + ], + "split_kwargs": { "ignore": true } + } + }, + { + "data": { + "dataset": ["airline_depdelay", "bosch", "epsilon"], + "split_kwargs": { "ignore": true } + } + } + ] + }, + "TEMPLATES": { + "sklearn pca": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "pca parameters", + "high-load pca datasets" + ] + }, + "cuml pca": { + "SETS": [ + "cuml implementation", + "pca parameters", + "cuml pca parameters", + "high-load pca datasets" + ] + } + } +} diff --git a/configs/weekly/svm.json b/configs/weekly/svm.json new file mode 100644 index 00000000..93f73961 --- /dev/null +++ b/configs/weekly/svm.json @@ -0,0 +1,144 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/svm.json"], + "PARAMETERS_SETS": { + "high-load svc binary datasets": [ + { + "data": { "dataset": "a9a", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "linear" } } + }, + { + "data": { "dataset": "skin_segmentation", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 10.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "ijcnn", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "epsilon", "split_kwargs": { "train_size": 100000, "test_size": 100000 } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } + } + ], + "high-load svc multiclass datasets": [ + { + "data": { "dataset": "connect", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 10.0, "kernel": ["poly", "rbf"] } } + }, + { + "data": { + "dataset": "mnist", + "split_kwargs": { "ignore": true }, + "preprocessing_kwargs": { "normalize": false } + }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } } + } + ], + "high-load svr datasets": [ + { + "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 50000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "twodplanes", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } + }, + { + "data": { + "source": "make_regression", + "generation_kwargs": { + "n_samples": 200000, + "n_features": 1000, + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "train_size": 0.5 } + }, + "algorithm": { "estimator_params": { "C": 0.1, "kernel": "linear" } } + } + ], + "high-load nusvc datasets": [ + { + "data": { "dataset": "a9a", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "nu": 0.1, "kernel": ["poly", "rbf"] } } + }, + { + "data": { "dataset": "codrnanorm", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "nu": 0.5, "kernel": "poly" } } + }, + { + "data": { "dataset": "ijcnn", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } } + } + ], + "high-load nusvr datasets": [ + { + "data": { "dataset": "year_prediction_msd", "split_kwargs": { "train_size": 50000, "test_size": null } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": "rbf" } } + }, + { + "data": { "dataset": "twodplanes", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } } + }, + { + "data": { "dataset": "fried", "split_kwargs": { "ignore": true } }, + "algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } } + }, + { + "data": { + "source": "make_regression", + "generation_kwargs": { + "n_samples": 100000, + "n_features": 1000, + "n_informative": "[SPECIAL_VALUE]0.5" + }, + "split_kwargs": { "train_size": 0.5 } + }, + "algorithm": { "estimator_params": { "nu": 0.5, "C": 0.1, "kernel": "linear" } } + } + ] + }, + "TEMPLATES": { + "svc binary": { + "SETS": [ + "binary svc implementations", + "common svm parameters", + "svm clsf parameters", + "svc parameters", + "high-load svc binary datasets" + ] + }, + "svc multiclass": { + "SETS": [ + "multi svc implementations", + "common svm parameters", + "svm clsf parameters", + "svc parameters", + "high-load svc multiclass datasets" + ] + }, + "svr": { + "SETS": [ + "svr implementations", + "common svm parameters", + "svr parameters", + "high-load svr datasets" + ] + }, + "nusvc": { + "SETS": [ + "nusvm implementations", + "common svm parameters", + "svm clsf parameters", + "nusvc parameters", + "high-load nusvc datasets" + ] + }, + "nusvr": { + "SETS": [ + "nusvm implementations", + "common svm parameters", + "nusvr parameters", + "high-load nusvr datasets" + ] + } + } +} diff --git a/configs/weekly/train_test_split.json b/configs/weekly/train_test_split.json new file mode 100644 index 00000000..5dbab5f4 --- /dev/null +++ b/configs/weekly/train_test_split.json @@ -0,0 +1,40 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/train_test_split.json"], + "PARAMETERS_SETS": { + "high-load train_test_split datasets": [ + { + "data": { + "dataset": [ + "airline_depdelay", + "higgs", + "bosch" + ] + } + }, + { + "data": { + "source": "make_regression", + "generation_kwargs": [ + { "n_samples": 200000000, "n_features": 5 } + ] + } + } + ] + }, + "TEMPLATES": { + "sklearn train_test_split": { + "SETS": [ + "sklearn-ex[cpu] implementations", + "train_test_split parameters", + "high-load train_test_split datasets" + ] + }, + "cuml train_test_split": { + "SETS": [ + "cuml implementation", + "train_test_split parameters", + "high-load train_test_split datasets" + ] + } + } +} diff --git a/configs/weekly/tsne.json b/configs/weekly/tsne.json new file mode 100644 index 00000000..e743ddea --- /dev/null +++ b/configs/weekly/tsne.json @@ -0,0 +1,43 @@ +{ + "INCLUDE": ["../common/sklearn.json", "../common/tsne.json"], + "PARAMETERS_SETS": { + "high-load tsne datasets": [ + { + "data": { + "dataset": ["medical_charges_nominal"], + "split_kwargs": { "ignore": true } + } + }, + { + "data": { + "dataset": "hepmass", + "split_kwargs": { "train_size": [100000, 200000, 500000] } + } + }, + { + "data": { + "dataset": ["sensit", "mnist", "cifar"], + "split_kwargs": { "ignore": true } + } + } + ] + }, + "TEMPLATES": { + "sklearn tsne": { + "SETS": [ + "sklearn-ex[cpu,gpu] implementations", + "common tsne parameters", + "sklearn parameters", + "high-load tsne datasets" + ] + }, + "cuml tsne": { + "SETS": [ + "cuml implementation", + "common tsne parameters", + "cuml parameters", + "high-load tsne datasets" + ] + } + } +}