Apache Spark DataFrame Average

We had some trouble computing the average of a DataFrame column, even though the `avg` method is readily available.

We kept getting an error saying that the column was not numeric: unless told otherwise, the CSV reader loads every column as a string.

After a bit of reading I figured out that I needed to use a UDF (user-defined function) to transform the string column into a numeric column.

package com.cinq.experience;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;


public class DataFrameAvg {

	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("DataFrameAvg").setMaster("local");
		JavaSparkContext jsc = new JavaSparkContext(conf);
		SQLContext sqlContext = new SQLContext(jsc);

		DataFrame df = sqlContext.read()
				.format("com.databricks.spark.csv")
				.option("header", "true")
				.load("numericdata.csv");

		df.registerTempTable("allData");
		df.show();
		sqlContext.udf().register("toInt", new UDF1() {
			public Integer call(String s) throws Exception {
				System.out.println("Parsing: " + s);
				return Integer.parseInt(s);
			}
		}, DataTypes.IntegerType);

		DataFrame withNumber = sqlContext.sql("SELECT toInt(number) from allData");
		withNumber.groupBy().avg("c0").show();
	}
}

The content of numericdata.csv is very simple:

name,number
person1,53
person2,42
person3,27
person4,15
person5,24
person6,30
person7,33
person8,36
Advertisements

Published by

m5c

Java developer who loves photography and good coffee

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out /  Change )

Google+ photo

You are commenting using your Google+ account. Log Out /  Change )

Twitter picture

You are commenting using your Twitter account. Log Out /  Change )

Facebook photo

You are commenting using your Facebook account. Log Out /  Change )

Connecting to %s