There is nothing like having one way that works to help you find more.
I found the .cast() method on Column for the columns I want to use as numeric values, which avoids writing a UDF to transform them.
I now prefer this way… until I find another, simpler one…
package com.cinq.experience;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;

public class Session {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SparkExperience").setMaster("local");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(jsc);

        // Load the CSV with spark-csv; every column comes in as a string.
        DataFrame df = sqlContext.read()
                .format("com.databricks.spark.csv")
                .option("header", "true")
                .load("session.csv")
                .cache();

        // Cast the count column to a long and alias it, so the aggregation
        // below can refer to it by a stable name instead of the generated
        // "CAST(x-custom-count, LongType)" label.
        DataFrame crazy = df.select(
                df.col("x-custom-a"),
                df.col("x-custom-count").cast(DataTypes.LongType).alias("x-custom-count"));

        crazy.groupBy(crazy.col("x-custom-a")).avg("x-custom-count").show();

        jsc.stop();
    }
}
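For contrast, here is roughly what the UDF route looks like. This is a minimal sketch, assuming the same sqlContext and df as above and the callUDF helper from Spark 1.5+; the toLong name is made up for the example:

import org.apache.spark.sql.api.java.UDF1;
import static org.apache.spark.sql.functions.callUDF;

// Register a string-to-long UDF under a hypothetical name.
sqlContext.udf().register("toLong",
        (UDF1<String, Long>) s -> Long.valueOf(s),
        DataTypes.LongType);

// Apply it column by column; note the extra registration step
// that .cast() makes unnecessary.
DataFrame viaUdf = df.select(
        df.col("x-custom-a"),
        callUDF("toLong", df.col("x-custom-count")).alias("x-custom-count"));
viaUdf.groupBy(viaUdf.col("x-custom-a")).avg("x-custom-count").show();

Either version should produce the same averages; .cast() is just less ceremony for a plain type conversion.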