This recipe is based on the Databricks spark-avro library; all rights to the library are owned by Databricks.

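Download the spark-avro library source, unpack it into spark-avro-master, build it, and then change to the build output directory that contains the JAR: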

$ cd spark-avro-master
$ cd target/scala-2.10

Now start spark-shell, loading the spark-avro library:

$ spark-shell --jars spark-avro_2.10-0.1.jar
scala> val sqlContext = new org.apache.spark.sql.SQLContext(sc)
scala> // Bring the SQLContext members (including sql) into scope so they can be called unqualified.
scala> import sqlContext._
scala> // The spark-avro import adds avroFile to the SQLContext.
scala> import com.databricks.spark.avro._
scala> val ufos = sqlContext.avroFile("hdfs://localhost:9000/user/hduser/ufodata/ufo_awesome.avro")
scala> // Register the loaded data as a temporary table; without this, SQL cannot resolve the name "ufos".
scala> ufos.registerTempTable("ufos")
scala> sql("select count(*) from ufos").collect.foreach(println)
scala> sql("select distinct location from ufos").collect.foreach(println)

Since ufos is a SchemaRDD, you can query it just like any other SchemaRDD.