PySpark SQL
# Sample employee records: (first name, last name, age, gender, salary).
# NOTE(review): ages are stored as strings, not ints — confirm the intended schema.
data = [
    ("John", "Cena", "33", "M", 50000),
    ("Michael", "Corneol", "44", "M", 44000),
    ("Robert", "Duvval", "42", "M", 67000),
    ("Anne", "Adams", "92", "F", 34000),
    ("Peter", "Parker", "57", "M", 100000),
    ("Peter", "Hains", "47", "M", 500000),
    ("Penelope", "Cruze", "44", "F", 1000000),
]
columns = ["fname", "lname", "age", "gender", "sal"]
df = spark.createDataFrame(data, columns)

# Using SQL: register the DataFrame as a temp view, then query it
# ordered by salary ascending.
df.createOrReplaceTempView("EMP")
spark.sql("select * from EMP ORDER BY sal asc").show(truncate=False)
Output:
+--------+-------+---+------+-------+
|fname |lname |age|gender|sal |
+--------+-------+---+------+-------+
|Anne |Adams |92 |F |34000 |
|Michael |Corneol|44 |M |44000 |
|John |Cena |33 |M |50000 |
|Robert |Duvval |42 |M |67000 |
|Peter |Parker |57 |M |100000 |
|Peter |Hains |47 |M |500000 |
|Penelope|Cruze |44 |F |1000000|
+--------+-------+---+------+-------+
# avg (average) Aggregate Function
# Bug fix: `avg` is never imported in this snippet — without this import the
# call below raises NameError (or silently uses the built-in of the same name
# if one were shadowed). It must come from pyspark.sql.functions.
from pyspark.sql.functions import avg

# collect() returns a list of Rows; [0][0] extracts the single scalar result.
avg_sal = df.select(avg("sal")).collect()[0][0]
print(f"avg: {avg_sal}")
Output:
avg: 256428.57142857142
No comments:
Post a Comment