kmeans
Scala program
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

/**
  * Created by hui on 2017/11/21.
  * K-means clustering
  */
object kmeans {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kmeans").setMaster("local")
    val sc = new SparkContext(conf)

    // Load and parse the data: one space-separated vector per line
    val data = sc.textFile("data/mllib/kmeans_data.txt")
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache()

    // Cluster the data into two classes using KMeans
    val numClusters = 2
    val numIterations = 20
    val clusters = KMeans.train(parsedData, numClusters, numIterations)

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    val WSSSE = clusters.computeCost(parsedData)
    println("Within Set Sum of Squared Errors = " + WSSSE)

    // Save and load the model
    clusters.save(sc, "my_kmeans")
    val sameModel = KMeansModel.load(sc, "my_kmeans")
  }
}
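The program above only prints the WSSSE. If you also want to inspect the learned centroids and assign new points to clusters, a minimal follow-up sketch could look like the following (it assumes the `clusters` model from the code above is still in scope; the query point is a made-up example and must have the same dimensionality as the training vectors):

// Inspect the learned centroids
clusters.clusterCenters.zipWithIndex.foreach { case (center, i) =>
  println(s"Center $i: $center")
}

// Assign a hypothetical new point to its nearest cluster
val query = Vectors.dense(0.2, 0.2, 0.2)   // example point, not from the original data
val clusterId = clusters.predict(query)
println(s"Point $query belongs to cluster $clusterId")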
Java program
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public class JavaKMeansExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaKMeansExample").setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(conf);

    // Load and parse data
    String path = "data/mllib/kmeans_data.txt";
    JavaRDD<String> data = jsc.textFile(path);
    JavaRDD<Vector> parsedData = data.map(
      new Function<String, Vector>() {
        public Vector call(String s) {
          String[] sarray = s.split(" ");
          double[] values = new double[sarray.length];
          for (int i = 0; i < sarray.length; i++) {
            values[i] = Double.parseDouble(sarray[i]);
          }
          return Vectors.dense(values);
        }
      }
    );
    parsedData.cache();

    // Cluster the data into two classes using KMeans
    int numClusters = 2;
    int numIterations = 20;
    KMeansModel clusters = KMeans.train(parsedData.rdd(), numClusters, numIterations);

    System.out.println("Cluster centers:");
    for (Vector center : clusters.clusterCenters()) {
      System.out.println(" " + center);
    }
    double cost = clusters.computeCost(parsedData.rdd());
    System.out.println("Cost: " + cost);

    // Evaluate clustering by computing Within Set Sum of Squared Errors
    double WSSSE = clusters.computeCost(parsedData.rdd());
    System.out.println("Within Set Sum of Squared Errors = " + WSSSE);

    // Save and load model
    clusters.save(jsc.sc(), "target/org/apache/spark/JavaKMeansExample/KMeansModel");
    KMeansModel sameModel = KMeansModel.load(jsc.sc(),
      "target/org/apache/spark/JavaKMeansExample/KMeansModel");

    jsc.stop();
  }
}
Run results
decisiontree
Scala program
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.mllib.tree.model.DecisionTreeModel
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
  * Created by hui on 2017/11/21.
  * Classification with a decision tree of depth 5
  */
object decisiontree {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("decisiontree").setMaster("local")
    val sc = new SparkContext(conf)

    // Load the data in LIBSVM format and split it into training (70%) and test (30%) sets
    val data = MLUtils.loadLibSVMFile(sc, "E:\\ideaProjects\\TestBook\\data\\mllib\\sample_libsvm_data.txt")
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Empty categoricalFeaturesInfo indicates all features are continuous
    val numClass = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val impurity = "gini"
    val maxDepth = 5
    val maxBins = 32

    val model = DecisionTree.trainClassifier(trainingData, numClass, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins)

    // Evaluate the model on the test set and compute the test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)
    println("Learned classification tree model:\n" + model.toDebugString)

    // Save and load the model
    model.save(sc, "my_decisiontree")
    val sameModel = DecisionTreeModel.load(sc, "my_decisiontree")
  }
}
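Test error alone does not show which class the tree gets wrong. An optional extension, sketched below, uses MLlib's MulticlassMetrics on the `labelAndPreds` RDD from the program above (the `accuracy` field assumes Spark 2.x; on older versions use the per-label precision/recall methods instead):

import org.apache.spark.mllib.evaluation.MulticlassMetrics

// MulticlassMetrics expects (prediction, label) pairs, so swap the tuples
val metrics = new MulticlassMetrics(labelAndPreds.map { case (label, pred) => (pred, label) })
println("Confusion matrix:\n" + metrics.confusionMatrix)
println("Accuracy = " + metrics.accuracy)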
Java program
import java.util.HashMap;
import java.util.Map;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.DecisionTree;
import org.apache.spark.mllib.tree.model.DecisionTreeModel;
import org.apache.spark.mllib.util.MLUtils;

class JavaDecisionTreeClassificationExample {
  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
      .setAppName("JavaDecisionTreeClassificationExample").setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;

    // Train a DecisionTree model for classification.
    final DecisionTreeModel model = DecisionTree.trainClassifier(trainingData, numClasses,
      categoricalFeaturesInfo, impurity, maxDepth, maxBins);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override
        public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<>(model.predict(p.features()), p.label());
        }
      });
    Double testErr =
      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
          return !pl._1().equals(pl._2());
        }
      }).count() / testData.count();

    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification tree model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
    DecisionTreeModel sameModel = DecisionTreeModel
      .load(jsc.sc(), "target/tmp/myDecisionTreeClassificationModel");
  }
}
Run results
randforest_classifier
Scala program
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel

/**
  * Created by hui on 2017/11/21.
  * Classification with a random forest
  */
object randforest_classifier {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("randforest_classifier").setMaster("local")
    val sc = new SparkContext(conf)

    // Load the data in LIBSVM format and split it into training (70%) and test (30%) sets
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Empty categoricalFeaturesInfo indicates all features are continuous
    val numClass = 2
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3
    val featureSubsetStrategy = "auto"
    val impurity = "gini"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainClassifier(trainingData, numClass, categoricalFeaturesInfo,
      numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate the model on the test set and compute the test error
    val labelAndPreds = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
    println("Test Error = " + testErr)
    println("Learned classification forest model:\n" + model.toDebugString)

    // Save and load the model
    model.save(sc, "myModelPath")
    val sameModel = RandomForestModel.load(sc, "myModelPath")
  }
}
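Since the program both saves and reloads the forest, a quick sanity check is to compare the two models on a single test example. The sketch below assumes the `model`, `sameModel`, and `testData` values from the code above are still in scope:

// The reloaded model should produce the same prediction as the original one
val firstPoint = testData.first()
println("Original model prediction: " + model.predict(firstPoint.features))
println("Reloaded model prediction: " + sameModel.predict(firstPoint.features))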
Java program
import java.util.HashMap;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;

public class JavaRandomForestClassificationExample {
  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
      .setAppName("JavaRandomForestClassificationExample").setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Train a RandomForest model.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Integer numClasses = 2;
    HashMap<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 3; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
    String impurity = "gini";
    Integer maxDepth = 5;
    Integer maxBins = 32;
    Integer seed = 12345;

    final RandomForestModel model = RandomForest.trainClassifier(trainingData, numClasses,
      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
      seed);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override
        public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<>(model.predict(p.features()), p.label());
        }
      });
    Double testErr =
      1.0 * predictionAndLabel.filter(new Function<Tuple2<Double, Double>, Boolean>() {
        @Override
        public Boolean call(Tuple2<Double, Double> pl) {
          return !pl._1().equals(pl._2());
        }
      }).count() / testData.count();

    System.out.println("Test Error: " + testErr);
    System.out.println("Learned classification forest model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myRandomForestClassificationModel");
    RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
      "target/tmp/myRandomForestClassificationModel");

    jsc.stop();
  }
}
Run results
randforest_regressor
Scala program
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.mllib.tree.model.RandomForestModel

/**
  * Created by hui on 2017/11/21.
  * Regression with a random forest
  */
object randforest_regressor {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("randforest_regressor").setMaster("local")
    val sc = new SparkContext(conf)

    // Load the data in LIBSVM format and split it into training (70%) and test (30%) sets
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val splits = data.randomSplit(Array(0.7, 0.3))
    val (trainingData, testData) = (splits(0), splits(1))

    // Empty categoricalFeaturesInfo indicates all features are continuous;
    // "variance" is the impurity measure used for regression trees
    val categoricalFeaturesInfo = Map[Int, Int]()
    val numTrees = 3
    val featureSubsetStrategy = "auto"
    val impurity = "variance"
    val maxDepth = 4
    val maxBins = 32

    val model = RandomForest.trainRegressor(trainingData, categoricalFeaturesInfo, numTrees,
      featureSubsetStrategy, impurity, maxDepth, maxBins)

    // Evaluate the model on the test set and compute the mean squared error
    val labelAndPredictions = testData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val testMSE = labelAndPredictions.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("Test Mean Squared Error = " + testMSE)
    println("Learned regression forest model:\n" + model.toDebugString)

    // Save and load the model
    model.save(sc, "myModelPath")
    val sameModel = RandomForestModel.load(sc, "myModelPath")
  }
}
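Besides the hand-computed MSE, MLlib ships a RegressionMetrics helper that reports RMSE, MAE, and R^2 in one place. A small sketch, assuming the `labelAndPredictions` RDD from the program above:

import org.apache.spark.mllib.evaluation.RegressionMetrics

// RegressionMetrics expects (prediction, observation) pairs, so swap the tuples
val regMetrics = new RegressionMetrics(labelAndPredictions.map { case (label, pred) => (pred, label) })
println("RMSE = " + regMetrics.rootMeanSquaredError)
println("R^2  = " + regMetrics.r2)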
Java program
import java.util.HashMap;
import java.util.Map;

import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.tree.RandomForest;
import org.apache.spark.mllib.tree.model.RandomForestModel;
import org.apache.spark.mllib.util.MLUtils;

public class JavaRandomForestRegressionExample {
  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf()
      .setAppName("JavaRandomForestRegressionExample").setMaster("local");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);

    // Load and parse the data file.
    String datapath = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), datapath).toJavaRDD();
    // Split the data into training and test sets (30% held out for testing)
    JavaRDD<LabeledPoint>[] splits = data.randomSplit(new double[]{0.7, 0.3});
    JavaRDD<LabeledPoint> trainingData = splits[0];
    JavaRDD<LabeledPoint> testData = splits[1];

    // Set parameters.
    // Empty categoricalFeaturesInfo indicates all features are continuous.
    Map<Integer, Integer> categoricalFeaturesInfo = new HashMap<>();
    Integer numTrees = 3; // Use more in practice.
    String featureSubsetStrategy = "auto"; // Let the algorithm choose.
    String impurity = "variance";
    Integer maxDepth = 4;
    Integer maxBins = 32;
    Integer seed = 12345;

    // Train a RandomForest model.
    final RandomForestModel model = RandomForest.trainRegressor(trainingData,
      categoricalFeaturesInfo, numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins,
      seed);

    // Evaluate model on test instances and compute test error
    JavaPairRDD<Double, Double> predictionAndLabel =
      testData.mapToPair(new PairFunction<LabeledPoint, Double, Double>() {
        @Override
        public Tuple2<Double, Double> call(LabeledPoint p) {
          return new Tuple2<>(model.predict(p.features()), p.label());
        }
      });
    Double testMSE =
      predictionAndLabel.map(new Function<Tuple2<Double, Double>, Double>() {
        @Override
        public Double call(Tuple2<Double, Double> pl) {
          Double diff = pl._1() - pl._2();
          return diff * diff;
        }
      }).reduce(new Function2<Double, Double, Double>() {
        @Override
        public Double call(Double a, Double b) {
          return a + b;
        }
      }) / testData.count();

    System.out.println("Test Mean Squared Error: " + testMSE);
    System.out.println("Learned regression forest model:\n" + model.toDebugString());

    // Save and load model
    model.save(jsc.sc(), "target/tmp/myRandomForestRegressionModel");
    RandomForestModel sameModel = RandomForestModel.load(jsc.sc(),
      "target/tmp/myRandomForestRegressionModel");

    jsc.stop();
  }
}
Run results
svm
Scala program
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.util.MLUtils

/**
  * Created by hui on 2017/11/21.
  * Support vector machine classification
  */
object svm {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("svm").setMaster("local")
    val sc = new SparkContext(conf)

    // Load the data in LIBSVM format and split it into training (60%) and test (40%) sets
    val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")
    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    // Run the training algorithm to build the model
    val numIterations = 100
    val model = SVMWithSGD.train(training, numIterations)

    // Clear the default threshold so predict() returns raw scores
    model.clearThreshold()

    // Compute raw scores on the test set
    val scoreAndLabels = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }

    // Get evaluation metrics
    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
    val auROC = metrics.areaUnderROC()
    println("Area under ROC = " + auROC)

    // Save and load the model
    model.save(sc, "my_svm")
    val sameModel = SVMModel.load(sc, "my_svm")
  }
}
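Because the threshold is cleared, `predict()` above returns raw margins rather than 0/1 labels, which is what BinaryClassificationMetrics needs. If you also want plain accuracy, a small follow-up sketch (assuming the `model` and `test` values from the program above; 0.0 is the usual default decision threshold for SVMWithSGD) would be:

// Restore a decision threshold so predict() returns 0/1 class labels again
model.setThreshold(0.0)
val accuracy = test.map { point =>
  if (model.predict(point.features) == point.label) 1.0 else 0.0
}.mean()
println("Test accuracy = " + accuracy)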
Java program
import scala.Tuple2;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.classification.SVMModel;
import org.apache.spark.mllib.classification.SVMWithSGD;
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics;
import org.apache.spark.mllib.regression.LabeledPoint;
import org.apache.spark.mllib.util.MLUtils;

/**
 * Example for SVMWithSGD.
 */
public class JavaSVMWithSGDExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("JavaSVMWithSGDExample").setMaster("local");
    SparkContext sc = new SparkContext(conf);

    String path = "data/mllib/sample_libsvm_data.txt";
    JavaRDD<LabeledPoint> data = MLUtils.loadLibSVMFile(sc, path).toJavaRDD();

    // Split initial RDD into two... [60% training data, 40% testing data].
    JavaRDD<LabeledPoint> training = data.sample(false, 0.6, 11L);
    training.cache();
    JavaRDD<LabeledPoint> test = data.subtract(training);

    // Run training algorithm to build the model.
    int numIterations = 100;
    final SVMModel model = SVMWithSGD.train(training.rdd(), numIterations);

    // Clear the default threshold.
    model.clearThreshold();

    // Compute raw scores on the test set.
    JavaRDD<Tuple2<Object, Object>> scoreAndLabels = test.map(
      new Function<LabeledPoint, Tuple2<Object, Object>>() {
        public Tuple2<Object, Object> call(LabeledPoint p) {
          Double score = model.predict(p.features());
          return new Tuple2<Object, Object>(score, p.label());
        }
      }
    );

    // Get evaluation metrics.
    BinaryClassificationMetrics metrics =
      new BinaryClassificationMetrics(JavaRDD.toRDD(scoreAndLabels));
    double auROC = metrics.areaUnderROC();

    System.out.println("Area under ROC = " + auROC);

    // Save and load model
    model.save(sc, "target/tmp/javaSVMWithSGDModel");
    SVMModel sameModel = SVMModel.load(sc, "target/tmp/javaSVMWithSGDModel");

    sc.stop();
  }
}
Run results