...

Cheat Sheet: SparkSQL to PySpark


1. Basic Operations

SparkSQL

SELECT * FROM table
SELECT column1, column2 FROM table
SELECT COUNT(*) FROM table
SELECT DISTINCT column FROM table
SELECT * FROM table WHERE column = value
SELECT * FROM table WHERE column > value
SELECT * FROM table ORDER BY column ASC
SELECT * FROM table ORDER BY column DESC

PySpark

df.select("*") df.select("column1", "column2") df.count() df.select("column").distinct() df.filter(df.column == value) df.filter(df.column > value) df.orderBy("column", ascending=True) df.orderBy("column", ascending=False)

2. Aggregations

SparkSQL

SELECT COUNT(column) FROM table
SELECT MAX(column) FROM table
SELECT MIN(column) FROM table
SELECT AVG(column) FROM table
SELECT SUM(column) FROM table
SELECT column, COUNT(*) FROM table GROUP BY column
SELECT column, SUM(column2) FROM table GROUP BY column
SELECT column, AVG(column2) FROM table GROUP BY column
SELECT column, MIN(column2) FROM table GROUP BY column

PySpark

df.agg(F.count("column").alias("count")) df.agg(F.max("column").alias("max")) df.agg(F.min("column").alias("min")) df.agg(F.avg("column").alias("avg")) df.agg(F.sum("column").alias("sum")) df.groupBy("column").agg(F.count("*").alias("count")) df.groupBy("column").agg(F.sum("column2").alias("sum")) df.groupBy("column").agg(F.avg("column2").alias("avg")) df.groupBy("column").agg(F.min("column2").alias("min"))

3. Joins

SparkSQL

SELECT * FROM table1 JOIN table2 ON condition
SELECT * FROM table1 LEFT JOIN table2 ON condition
SELECT * FROM table1 RIGHT JOIN table2 ON condition
SELECT * FROM table1 FULL JOIN table2 ON condition

PySpark

df1.join(df2, on="condition", how="inner") df1.join(df2, on="condition", how="left") df1.join(df2, on="condition", how="right") df1.join(df2, on="condition", how="outer")

4. Handling NULLs

SparkSQL

SELECT * FROM table WHERE column IS NULL
SELECT * FROM table WHERE column IS NOT NULL
SELECT column, COALESCE(column, 0) FROM table
SELECT column, IFNULL(column, 0) FROM table

PySpark

df.filter(df.column.isNull())
df.filter(df.column.isNotNull())
df.withColumn("column", F.coalesce(df.column, F.lit(0)))
df.withColumn("column", F.when(df.column.isNull(), 0).otherwise(df.column))
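A small sketch with an invented nullable column, assuming the spark session and the F alias from the earlier examples:

    import pyspark.sql.functions as F

    df = spark.createDataFrame([("a", 1), ("b", None)], ["key", "val"])

    df.filter(df.val.isNull()).show()                          # WHERE val IS NULL
    df.withColumn("val", F.coalesce(df.val, F.lit(0))).show()  # COALESCE(val, 0)
    df.withColumn("val", F.when(df.val.isNull(), 0)
                          .otherwise(df.val)).show()           # IFNULL(val, 0)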

5. String Functions

SparkSQL

SELECT UPPER(column) FROM table
SELECT LOWER(column) FROM table
SELECT TRIM(column) FROM table
SELECT SUBSTRING(column, 1, 3) FROM table
SELECT LENGTH(column) FROM table

PySpark

df.withColumn("column", F.upper(df.column)) df.withColumn("column", F.lower(df.column)) df.withColumn("column", F.trim(df.column)) df.withColumn("column", F.substring(df.column, 1, 3)) df.withColumn("columnLength", F.length(df.column))

6. Date Functions

SparkSQL

SELECT CURRENT_DATE()
SELECT CURRENT_TIMESTAMP()
SELECT YEAR(column) FROM table
SELECT MONTH(column) FROM table
SELECT DAY(column) FROM table
SELECT DATEDIFF(end_date, start_date) FROM table

PySpark

df.withColumn("current_date", F.current_date()) df.withColumn("current_timestamp", F.current_timestamp()) df.withColumn("year", F.year(df.column)) df.withColumn("month", F.month(df.column)) df.withColumn("day", F.dayofmonth(df.column)) df.withColumn("date_diff", F.datediff(df.end_date, df.start_date))

7. File Operations

SparkSQL

LOAD DATA INPATH '/path' INTO TABLE table
INSERT INTO table VALUES (1, 2)
SELECT * FROM table
CREATE TABLE new_table AS SELECT * FROM table

PySpark

df = spark.read.csv("/path") df.write.insertInto("table", overwrite=True) df = spark.table("table") df.write.saveAsTable("table", mode="overwrite")
