1. Create a Spark SQLContext
from pyspark.sql import SQLContext

# sc is an existing SparkContext
sqlContext = SQLContext(sc)
2. Read a JSON file into a DataFrame

df = sqlContext.read.json('./filename.json')
df.show()
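Once loaded, the DataFrame can also be queried with SQL through the same SQLContext. A minimal sketch, assuming the JSON records contain a hypothetical name field and registering the table under a hypothetical name people:

# Register the DataFrame as a temporary table so SQL can reference it
df.registerTempTable('people')
# 'name' is a hypothetical column; substitute a field from your own JSON
results = sqlContext.sql("SELECT name FROM people")
results.show()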
3. Summary statistics with MLlib

from pyspark.mllib.stat import Statistics

dataMatrix = sc.parallelize([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
summary = Statistics.colStats(dataMatrix)
print(summary.mean())         # mean of each column
print(summary.variance())     # variance of each column
print(summary.numNonzeros())  # number of nonzero entries in each column
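Statistics can also compute pairwise column correlations. A minimal sketch on the same dataMatrix ('pearson' is the default method; 'spearman' is the other supported option):

# Correlation matrix between the columns of dataMatrix
corrMatrix = Statistics.corr(dataMatrix, method='pearson')
print(corrMatrix)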
4. Train a decision tree classifier

from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils

# Load training data in LIBSVM format as an RDD of LabeledPoint
parsedData = MLUtils.loadLibSVMFile(sc, 'data.txt')
model = DecisionTree.trainClassifier(parsedData, numClasses=2,
                                     categoricalFeaturesInfo={})
print(model.toDebugString())
model.save(sc, 'decisionTreeModel')
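To sanity-check the trained model, you can compare its predictions against the known labels and reload the saved model. A minimal sketch, reusing parsedData as a stand-in for a proper held-out test set:

# Predict on the features, then pair each true label with its prediction
predictions = model.predict(parsedData.map(lambda p: p.features))
labelsAndPredictions = parsedData.map(lambda p: p.label).zip(predictions)
trainErr = labelsAndPredictions.filter(lambda lp: lp[0] != lp[1]).count() \
    / float(parsedData.count())
print('Training error: %g' % trainErr)

# Reload the model saved above
sameModel = DecisionTreeModel.load(sc, 'decisionTreeModel')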
5. K-means clustering

from pyspark.mllib.clustering import KMeans, KMeansModel
from numpy import array

data = sc.textFile("data.txt")
# Each line holds whitespace-separated numeric features
parsedData = data.map(lambda line: array([float(x) for x in line.split()]))
clusters = KMeans.train(parsedData, k=3)
print(clusters.clusterCenters)
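A common way to judge clustering quality is the Within Set Sum of Squared Errors (WSSSE): sum each point's distance to the center of the cluster it was assigned to. A minimal sketch using parsedData and clusters from above:

from math import sqrt

def error(point):
    # Distance from a point to its assigned cluster center
    center = clusters.clusterCenters[clusters.predict(point)]
    return sqrt(sum((point - center) ** 2))

WSSSE = parsedData.map(error).reduce(lambda x, y: x + y)
print('Within Set Sum of Squared Error = %f' % WSSSE)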