Cheat Sheet: SparkSQL to PySpark

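All PySpark snippets on this sheet assume a running SparkSession named spark and the functions module imported as F; a minimal setup sketch:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Entry point for every DataFrame operation below.
spark = SparkSession.builder.appName("cheatsheet").getOrCreate()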

1. Basic Operations

SparkSQL

SELECT * FROM table
SELECT column1, column2 FROM table
SELECT COUNT(*) FROM table
SELECT DISTINCT column FROM table
SELECT * FROM table WHERE column=value
SELECT * FROM table WHERE column>value
SELECT * FROM table ORDER BY column ASC
SELECT * FROM table ORDER BY column DESC

PySpark

df.select("*")
df.select("column1", "column2")
df.count()
df.select("column").distinct()
df.filter(df.column == value)
df.filter(df.column > value)
df.orderBy("column", ascending=True)
df.orderBy("column", ascending=False)

2. Aggregations

SparkSQL

SELECT COUNT(column) FROM table
SELECT MAX(column) FROM table
SELECT MIN(column) FROM table
SELECT AVG(column) FROM table
SELECT SUM(column) FROM table
SELECT column, COUNT(*) FROM table GROUP BY column
SELECT column, SUM(column2) FROM table GROUP BY column
SELECT column, AVG(column2) FROM table GROUP BY column
SELECT column, MIN(column2) FROM table GROUP BY column

PySpark

df.agg(F.count("column").alias("count"))
df.agg(F.max("column").alias("max"))
df.agg(F.min("column").alias("min"))
df.agg(F.avg("column").alias("avg"))
df.agg(F.sum("column").alias("sum"))
df.groupBy("column").agg(F.count("*").alias("count"))
df.groupBy("column").agg(F.sum("column2").alias("sum"))
df.groupBy("column").agg(F.avg("column2").alias("avg"))
df.groupBy("column").agg(F.min("column2").alias("min"))

3. Joins

SparkSQL

SELECT * FROM table1 JOIN table2 ON condition
SELECT * FROM table1 LEFT JOIN table2 ON condition
SELECT * FROM table1 RIGHT JOIN table2 ON condition
SELECT * FROM table1 FULL JOIN table2 ON condition

PySpark

df1.join(df2, on="condition", how="inner")
df1.join(df2, on="condition", how="left")
df1.join(df2, on="condition", how="right")
df1.join(df2, on="condition", how="outer")
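
Note that passing a plain string to on treats it as a shared column name; a join condition must be a Column expression, as above. A sketch with two toy DataFrames (the column names id and customer_id are illustrative):

customers = spark.createDataFrame([(1, "Ada"), (2, "Bob")], ["id", "name"])
orders = spark.createDataFrame([(1, 100), (1, 150), (3, 20)], ["customer_id", "amount"])
# A left join keeps customers with no matching orders (amount becomes NULL).
customers.join(orders, customers.id == orders.customer_id, "left").show()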

4. Handling NULLs

SparkSQL

SELECT * FROM table WHERE column IS NULL
SELECT * FROM table WHERE column IS NOT NULL
SELECT column, COALESCE(column, 0) FROM table
SELECT column, IFNULL(column, 0) FROM table

PySpark

df.filter(df.column.isNull())
df.filter(df.column.isNotNull())
df.withColumn("column", F.coalesce(df.column, F.lit(0)))
"df.withColumn(""column"", F.when(df.column.isNull(), 0)
.otherwise(df.column))"
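
A short sketch on a toy DataFrame containing a missing value:

people = spark.createDataFrame([("Ada", 36), ("Bob", None)], ["name", "age"])
people.filter(people.age.isNull()).show()                           # rows missing an age
people.withColumn("age", F.coalesce(people.age, F.lit(0))).show()   # default missing ages to 0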

5. String Functions

SparkSQL

SELECT UPPER(column) FROM table
SELECT LOWER(column) FROM table
SELECT TRIM(column) FROM table
SELECT SUBSTRING(column, 1, 3) FROM table
SELECT LENGTH(column) FROM table

PySpark

df.withColumn("column", F.upper(df.column))
df.withColumn("column", F.lower(df.column))
df.withColumn("column", F.trim(df.column))
df.withColumn("column", F.substring(df.column, 1, 3))
df.withColumn("columnLength", F.length(df.column))

6. Date Functions

SparkSQL

SELECT CURRENT_DATE()
SELECT CURRENT_TIMESTAMP()
SELECT YEAR(column) FROM table
SELECT MONTH(column) FROM table
SELECT DAY(column) FROM table
SELECT DATEDIFF(end_date, start_date) FROM table

PySpark

df.withColumn("current_date", F.current_date())
df.withColumn("current_timestamp", F.current_timestamp())
df.withColumn("year", F.year(df.column))
df.withColumn("month", F.month(df.column))
df.withColumn("day", F.dayofmonth(df.column))
df.withColumn("date_diff", F.datediff(df.end_date, df.start_date))

7. File Operations

SparkSQL

LOAD DATA INPATH '/path' INTO TABLE table
INSERT INTO table VALUES (1, 2)
SELECT * FROM table
CREATE TABLE table AS SELECT * FROM source

PySpark

df = spark.read.csv("/path")
df.write.insertInto("table")
df = spark.table("table")
df.write.saveAsTable("table", mode="overwrite")
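
An end-to-end sketch; the path and table name are placeholders:

# Read a CSV with a header row, letting Spark infer column types.
df = spark.read.csv("/path/to/data.csv", header=True, inferSchema=True)
df.write.saveAsTable("demo_table", mode="overwrite")   # persist as a managed table
df2 = spark.table("demo_table")                        # read it back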
