# Build a pair RDD of (integer key, name string) records from an in-memory
# list. `sc` is not defined in this snippet; it is expected to be an existing
# SparkContext (presumably the one the pyspark shell provides -- confirm).
RDD = sc.parallelize([
    (1, "paul"),
    (2, "anne"),
    (1, "emile"),
    (2, "marie"),
    (1, "victor"),
])

# Merge all values sharing a key into one "-"-separated string, then collect
# the (key, merged string) pairs back to the driver and print them.
# NOTE(review): reduceByKey is documented as expecting a commutative and
# associative function; string concatenation is not commutative, so the order
# of the names inside each merged string may depend on partitioning.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(RDD.reduceByKey(lambda a, b: a + "-" + b).collect())