Search This Blog

PySpark SQL

PySpark SQL

# Sample employee rows: (first name, last name, age, gender, salary).
# NOTE(review): age is given as a string, so Spark infers a StringType
# column for it — convert to int if numeric age comparisons are needed.
data = [
    ("John", "Cena", "33", "M", 50000),
    ("Michael", "Corneol", "44", "M", 44000),
    ("Robert", "Duvval", "42", "M", 67000),
    ("Anne", "Adams", "92", "F", 34000),
    ("Peter", "Parker", "57", "M", 100000),
    ("Peter", "Hains", "47", "M", 500000),
    ("Penelope", "Cruze", "44", "F", 1000000),
]
columns = ["fname", "lname", "age", "gender", "sal"]

# Build the DataFrame; `spark` (a SparkSession) is assumed to already be
# in scope, e.g. provided by a notebook or the pyspark shell.
df = spark.createDataFrame(data, columns)

# Using SQL: register the DataFrame as a temporary view, then query it
# and print every row ordered by salary, lowest first.
df.createOrReplaceTempView("EMP")
spark.sql("select * from EMP ORDER BY sal asc") \
    .show(truncate=False)
Output:
+--------+-------+---+------+-------+ |fname |lname |age|gender|sal | +--------+-------+---+------+-------+ |Anne |Adams |92 |F |34000 | |Michael |Corneol|44 |M |44000 | |John |Cena |33 |M |50000 | |Robert |Duvval |42 |M |67000 | |Peter |Parker |57 |M |100000 | |Peter |Hains |47 |M |500000 | |Penelope|Cruze |44 |F |1000000| +--------+-------+---+------+-------+


# avg (average) Aggregate Function: compute the mean salary over all rows.
mean_salary = df.select(avg("sal")).collect()[0][0]
print(f"avg: {mean_salary}")
Output:
avg: 256428.57142857142








No comments:

Post a Comment