sudo docker run -it --rm -p 8088:8888 jupyter/all-spark-notebook
Test case: a usage example inside the Jupyter notebook.
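With the port mapping above, the notebook UI is reachable at http://localhost:8088 (the login token is printed in the container output). Below is a minimal sketch of a first notebook cell, assuming a Scala kernel (such as the image's spylon-kernel) that already exposes a SparkSession bound to `spark`; the sample rows are made up purely for illustration.

// Quick smoke test that Spark is reachable from the notebook kernel.
import spark.implicits._

val df = Seq(("spark", 1), ("jupyter", 2)).toDF("name", "id")
df.show()            // prints the two sample rows
println(df.count())  // 2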
Read a file with spark-shell
$ spark-shell

scala> val words = sc.textFile("file:///usr/local/spark-3.5.0-bin-hadoop3/NOTICE")
words: org.apache.spark.rdd.RDD[String] = file:///usr/local/spark-3.5.0-bin-hadoop3/NOTICE MapPartitionsRDD[7] at textFile at <console>:23

scala> val rddtest = words.cache
rddtest: words.type = file:///usr/local/spark-3.5.0-bin-hadoop3/NOTICE MapPartitionsRDD[7] at textFile at <console>:23

scala> val fk = rddtest.first
fk: String = Apache Spark
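As a natural follow-up, the cached RDD can be turned into a word count. This is a minimal sketch run in the same spark-shell session; it only assumes the `rddtest` value defined above.

// Count word occurrences in the NOTICE file, continuing from rddtest above.
val counts = rddtest
  .flatMap(_.split("\\s+"))       // split each line on whitespace
  .filter(_.nonEmpty)             // drop empty tokens
  .map(word => (word, 1))         // pair each word with an initial count of 1
  .reduceByKey(_ + _)             // sum the counts per word

counts.take(5).foreach(println)   // print a few (word, count) pairs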
Run the following in spark-shell:

import org.apache.spark._
import org.apache.spark.streaming._
import org.apache.spark.streaming.StreamingContext._ // not necessary since Spark 1.3

// Create a local StreamingContext with two working threads and a batch interval of 1 second.
// The master requires 2 cores to prevent a starvation scenario.
val ssc = new StreamingContext(sc, Seconds(1))

// Create a DStream that will connect to hostname:port, like localhost:9999
val lines = ssc.socketTextStream("localhost", 9999)

// Split each line into words
val words = lines.flatMap(_.split(" "))

// Count each word in each batch
val pairs = words.map(word => (word, 1))
val wordCounts = pairs.reduceByKey(_ + _)

// Print the first ten elements of each RDD generated in this DStream to the console
wordCounts.print()

ssc.start() // Start the computation
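To see any output, the socket source needs something writing to localhost:9999. As in the Spark Streaming quick example, run Netcat as a simple data server in a separate terminal before starting the computation, then type lines into it:

$ nc -lk 9999

Each one-second batch then prints its word counts in the spark-shell. When submitting this as a standalone application rather than typing it into the shell, ssc.awaitTermination() would normally follow ssc.start(); in the shell, ssc.stop(stopSparkContext = false) shuts the streaming context down without killing the shell's SparkContext.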