SparkSQL: Select, Filter, Sort
SELECT * FROM table
SELECT column1, column2 FROM table
SELECT COUNT(*) FROM table
SELECT DISTINCT column FROM table
SELECT * FROM table WHERE column=value
SELECT * FROM table WHERE column>value
SELECT * FROM table ORDER BY column ASC
SELECT * FROM table ORDER BY column DESC
PySpark: Select, Filter, Sort
df.select("*")
df.select("column1", "column2")
df.count()
df.select("column").distinct()
df.filter(df.column == value)
df.filter(df.column > value)
df.orderBy("column", ascending=True)
df.orderBy("column", ascending=False)
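A minimal runnable sketch tying these basics together (the "people" data, its column names, and the local master are made-up examples, not part of the cheat sheet):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("Ana", 34), ("Ben", 28), ("Ana", 34)], ["name", "age"])
df.createOrReplaceTempView("people")  # expose the DataFrame to SparkSQL

spark.sql("SELECT DISTINCT name FROM people WHERE age > 30").show()  # SparkSQL side
df.filter(df.age > 30).select("name").distinct().show()             # PySpark side
df.orderBy("age", ascending=False).show()                           # sort descending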
SparkSQL: Aggregations & Grouping
SELECT COUNT(column) FROM table
SELECT MAX(column) FROM table
SELECT MIN(column) FROM table
SELECT AVG(column) FROM table
SELECT SUM(column) FROM table
SELECT column, COUNT(*) FROM table GROUP BY column
SELECT column, SUM(column2) FROM table GROUP BY column
SELECT column, AVG(column2) FROM table GROUP BY column
SELECT column, MIN(column2) FROM table GROUP BY column
PySpark: Aggregations & Grouping
(the F.* calls below assume: from pyspark.sql import functions as F)
df.agg(F.count("column").alias("count"))
df.agg(F.max("column").alias("max"))
df.agg(F.min("column").alias("min"))
df.agg(F.avg("column").alias("avg"))
df.agg(F.sum("column").alias("sum"))
df.groupBy("column").agg(F.count("*").alias("count"))
df.groupBy("column").agg(F.sum("column2").alias("sum"))
df.groupBy("column").agg(F.avg("column2").alias("avg"))
df.groupBy("column").agg(F.min("column2").alias("min"))
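For example, a small grouped-aggregation sketch (the "sales" data is invented; `spark` is the session from the first sketch):

from pyspark.sql import functions as F

sales = spark.createDataFrame([("A", 10), ("B", 5), ("A", 7)], ["shop", "amount"])
sales.agg(F.sum("amount").alias("sum"), F.max("amount").alias("max")).show()
sales.groupBy("shop").agg(F.count("*").alias("count"), F.avg("amount").alias("avg")).show()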
SparkSQL: Joins
SELECT * FROM table1 JOIN table2 ON condition
SELECT * FROM table1 LEFT JOIN table2 ON condition
SELECT * FROM table1 RIGHT JOIN table2 ON condition
SELECT * FROM table1 FULL JOIN table2 ON condition
PySpark: Joins
# condition: a column name such as "id", or an expression such as df1.id == df2.id
df1.join(df2, on=condition, how="inner")
df1.join(df2, on=condition, how="left")
df1.join(df2, on=condition, how="right")
df1.join(df2, on=condition, how="outer")
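A short join sketch with invented orders/customers data, showing how an expression fills the condition placeholder:

orders = spark.createDataFrame([(1, "book"), (2, "pen"), (3, "ink")], ["customer_id", "product"])
customers = spark.createDataFrame([(1, "Ana"), (2, "Ben")], ["id", "name"])
# inner join drops order 3 (no matching customer)
orders.join(customers, on=orders.customer_id == customers.id, how="inner").show()
# left join keeps all orders; unmatched customer columns become NULL
orders.join(customers, on=orders.customer_id == customers.id, how="left").show()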
SparkSQL: Handling NULLs
SELECT * FROM table WHERE column IS NULL
SELECT * FROM table WHERE column IS NOT NULL
SELECT column, COALESCE(column, 0) FROM table
SELECT column, IFNULL(column, 0) FROM table
PySpark: Handling NULLs
df.filter(df.column.isNull())
df.filter(df.column.isNotNull())
df.withColumn("column", F.coalesce(df.column, F.lit(0)))
"df.withColumn(""column"", F.when(df.column.isNull(), 0)
.otherwise(df.column))"
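For instance, both NULL-replacement idioms on an invented scores table (coalesce and when/otherwise give the same result here):

from pyspark.sql import functions as F

scores = spark.createDataFrame([("Ana", 3), ("Ben", None)], ["name", "score"])
scores.filter(scores.score.isNull()).show()
# both lines replace NULL scores with 0
scores.withColumn("score", F.coalesce(scores.score, F.lit(0))).show()
scores.withColumn("score", F.when(scores.score.isNull(), 0).otherwise(scores.score)).show()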
SparkSQL: String Functions
SELECT UPPER(column) FROM table
SELECT LOWER(column) FROM table
SELECT TRIM(column) FROM table
SELECT SUBSTRING(column, 1, 3) FROM table
SELECT LENGTH(column) FROM table
PySpark: String Functions
df.withColumn("column", F.upper(df.column))
df.withColumn("column", F.lower(df.column))
df.withColumn("column", F.trim(df.column))
df.withColumn("column", F.substring(df.column, 1, 3))
df.withColumn("columnLength", F.length(df.column))
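A quick string-function sketch on an invented value (note that substring positions are 1-based):

from pyspark.sql import functions as F

names = spark.createDataFrame([("  Ada Lovelace  ",)], ["raw"])
names.select(
    F.upper(F.trim(names.raw)).alias("upper"),
    F.substring(names.raw, 3, 3).alias("sub"),  # 3 characters starting at position 3: "Ada"
    F.length(names.raw).alias("len"),
).show()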
SparkSQL: Date & Time
SELECT CURRENT_DATE()
SELECT CURRENT_TIMESTAMP()
SELECT YEAR(column) FROM table
SELECT MONTH(column) FROM table
SELECT DAY(column) FROM table
SELECT DATEDIFF(end_date, start_date) FROM table
PySpark: Date & Time
df.withColumn("current_date", F.current_date())
df.withColumn("current_timestamp", F.current_timestamp())
df.withColumn("year", F.year(df.column))
df.withColumn("month", F.month(df.column))
df.withColumn("day", F.dayofmonth(df.column))
df.withColumn("date_diff", F.datediff(df.end_date, df.start_date))
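A date sketch with invented dates; the strings are cast with to_date first so year/month/datediff operate on real date columns:

from pyspark.sql import functions as F

trips = spark.createDataFrame([("2024-01-15", "2024-03-01")], ["start", "end"])
trips = trips.select(F.to_date("start").alias("start_date"), F.to_date("end").alias("end_date"))
trips.select(
    F.year("start_date").alias("year"),
    F.month("start_date").alias("month"),
    F.dayofmonth("start_date").alias("day"),
    F.datediff("end_date", "start_date").alias("days_between"),  # 46 for these dates
).show()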
SparkSQL: Reading & Writing Data
LOAD DATA INPATH '/path' INTO TABLE table
INSERT INTO table VALUES (1, 2)
SELECT * FROM table
CREATE TABLE table AS SELECT * FROM source_table
PySpark: Reading & Writing Data
df = spark.read.csv("/path")
df.write.insertInto("table", overwrite=True)
df = spark.table("table")
df.write.saveAsTable("table", mode="overwrite")
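An end-to-end I/O sketch; the CSV path and table name are hypothetical, and header/inferSchema are optional reader settings, not part of the poster:

# read column names from the first row and guess column types
people = spark.read.csv("/tmp/people.csv", header=True, inferSchema=True)
people.write.saveAsTable("people_tbl", mode="overwrite")  # replaces any existing table
same_people = spark.table("people_tbl")  # read it back as a DataFrame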