
Problem: How do you get a count of the non-null and non-NaN values of all columns, or of selected columns, of a PySpark DataFrame, with Python examples?

Solution: To count non-null values of PySpark DataFrame columns, use the `isNotNull()` function, for example `df.name.isNotNull()`; for non-NaN values, use the negation of `isnan()`, for example `~isnan(df.name)`.

Note: In Python, `None` is the equivalent of SQL `null`, so on a PySpark DataFrame `None` values are shown as `null`.

Let’s create a DataFrame with some null values.

``````
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [(None,"CA"),("Julia",""),("Ram",None),("Ramya","NULL")]
df = spark.createDataFrame(data,["name","state"])
df.show()

+-----+-----+
| name|state|
+-----+-----+
| null|   CA|
|Julia|     |
|  Ram| null|
|Ramya| NULL|
+-----+-----+
``````

## PySpark Count of non-null values

``````
#Find count of non null value of a column
from pyspark.sql.functions import col
print(df.filter(col("name").isNotNull()).count())
#3
``````

## Count of non-null values of all DataFrame columns

``````
#Count of non-null for all columns
from pyspark.sql.functions import col, when, count
df.select([count(when(col(c).isNotNull(), c)).alias(c)
           for c in df.columns]).show()

+----+-----+
|name|state|
+----+-----+
|   3|    3|
+----+-----+
``````

## Count of non-null values of selected columns

``````
#Count of non-null for selected columns
from pyspark.sql.functions import col, when, count
df_Columns=["name","state"]
df.select([count(when(col(c).isNotNull(), c)).alias(c)
           for c in df_Columns]).show()
``````

Let’s see how to also treat literal placeholder strings such as `None`, `NULL`, and the empty string as missing values.

``````
#Count of non-null by ignoring null literal values
from pyspark.sql.functions import col, when, count
df2 = df.select([count(when(~col(c).contains('None') &
                            ~col(c).contains('NULL') &
                            (col(c) != '') &
                            ~col(c).isNull(), c)).alias(c)
                 for c in df.columns])
df2.show()

+----+-----+
|name|state|
+----+-----+
|   3|    1|
+----+-----+
``````

## PySpark Count of non-NaN values of DataFrame column

The below example demonstrates how to get a count of the non-NaN values of a PySpark DataFrame column.

``````
#Find count of non nan values of DataFrame column
import numpy as np
from pyspark.sql.functions import isnan
data = [(1,340.0),(1,None),(3,200.0),(4,np.nan)]
df = spark.createDataFrame(data,["id","number"])
df.show()
print(df.filter(~df.number.contains('None') &
                ~df.number.contains('NULL') &
                ~isnan(df.number) &
                ~df.number.isNull()).count())

#2
``````

## Complete Example

Following is the complete example of counting the non-null and non-NaN values of DataFrame columns.

``````
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [(None,"CA"),("Julia",""),("Ram",None),("Ramya","NULL")]
df = spark.createDataFrame(data,["name","state"])
df.show()

from pyspark.sql.functions import col, when, count, isnan
print(df.filter(~isnan("name") & col("name").isNotNull()).count())

#Count of non-null for all columns
df.select([count(when(~isnan(c) & col(c).isNotNull(), c)).alias(c)
           for c in df.columns]).show()

#Count of non-null for selected columns
df_Columns=["name","state"]
df.select([count(when(~isnan(c) & col(c).isNotNull(), c)).alias(c)
           for c in df_Columns]).show()

#Count of non-null by ignoring null literal values
df2 = df.select([count(when(~col(c).contains('None') &
                            ~col(c).contains('NULL') &
                            (col(c) != '') &
                            ~col(c).isNull(), c)).alias(c)
                 for c in df.columns])
df2.show()

#Count of non-nan values
from pyspark.sql.functions import isnan
import numpy as np
data = [(1,np.nan),(1,None),(3,200.0),(4,np.nan)]
df = spark.createDataFrame(data,["id","number"])
df.show()
print(df.filter(~df.number.contains('None') &
                ~df.number.contains('NULL') &
                ~isnan(df.number) &
                ~df.number.isNull()).count())
``````

## Conclusion

In this PySpark article, you have learned how to calculate the count of non-null and non-NaN values of all DataFrame columns, as well as of selected columns, with Python examples.
