Spark
In the pyspark interactive shell you can run:
>>> textFile = sc.textFile("file:///usr/local/spark/mycode/wordcount/word.txt")
>>> wordCount = textFile.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
>>> wordCount.collect()
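The same pipeline can also be built step by step, which makes it easier to inspect each intermediate RDD in the shell (a sketch using the same file as above; the variable names words and pairs are introduced here only for illustration):
>>> words = textFile.flatMap(lambda line: line.split(" "))   # one word per RDD element
>>> words.take(5)                                            # peek at a few words
>>> pairs = words.map(lambda word: (word, 1))                # (word, 1) pairs
>>> wordCount = pairs.reduceByKey(lambda a, b: a + b)        # sum counts per word
>>> wordCount.collect()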
Written as a standalone Python script:
from pyspark import SparkContext
sc = SparkContext('local', 'test')
textFile = sc.textFile("file:///usr/local/spark/mycode/wordcount/word.txt")
wordCount = textFile.flatMap(lambda line: line.split(" ")).map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
wordCount.foreach(print)
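To run this outside the interactive shell, the script can be submitted with spark-submit (a sketch; the script name wordcount.py is assumed here and is not given above):
$ /usr/local/spark/bin/spark-submit /usr/local/spark/mycode/wordcount/wordcount.py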
MapReduce
class WordCount:
    # @param {str} line: a line of text, e.g. "Bye Bye see you next"
    def mapper(self, _, line):
        # Emit a (word, 1) pair for every word in the line.
        for word in line.split():
            yield word, 1

    # @param key: a word emitted by the mapper
    # @param values: all counts emitted for that key
    def reducer(self, key, values):
        # Sum the counts to get the total occurrences of the word.
        yield key, sum(values)
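A minimal driver that wires the mapper and reducer together locally, with a dict standing in for the framework's shuffle step (the sample input lines below are made up for illustration):
from collections import defaultdict

if __name__ == '__main__':
    lines = ["Bye Bye see you next", "see you"]
    job = WordCount()
    # Map phase: collect the (word, 1) pairs, grouping values by key
    # as a stand-in for the shuffle.
    grouped = defaultdict(list)
    for line in lines:
        for key, value in job.mapper(None, line):
            grouped[key].append(value)
    # Reduce phase: sum the counts emitted for each word.
    for key in grouped:
        for word, count in job.reducer(key, grouped[key]):
            print(word, count)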