--

Spark 基础练习



1. 创建一个 Spark SQLContext

  from pyspark.sql import SQLContext
  sqlContext = SQLContext(sc)

我们可以通过一个已存在的 RDD、Hive table、data source 来创建 DataFrame。

2. 读入数据

  df = sqlContext.read.json('./filename.json')
  df.show()


Spark MLlib



  from pyspark.mllib.stat import Statistics
  dataMatrix = sc.parallelize([[1,2,3], [4,5,6], [7,8,9], [10,11,12]])

  summary = Statistics.colStats(dataMatrix)
  print(summary.mean())
  print(summary.variance())
  print(summary.numNonzeros())


  from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
  from pyspark.mllib.util import MLUtils

  data = MLUtils.loadLibSVMFile(sc, "data.txt")
  model = DecisionTree.trainClassifier(data, numClasses = 2, categoricalFeaturesInfo = {})
  print(model.toDebugString())
  model.save(sc, 'decisionTreeModel')


  from pyspark.mllib.clustering import KMeans, KMeansModel
  from numpy import array

  data = sc.textFile("data.txt")
  parsedData = data.map(lambda line: array([float(x) for x in line.split()]))

  clusters = KMeans.train(parsedData, k = 3)
  print(clusters.clusterCenters)