
PySpark Count of Non-null and Non-NaN Values in DataFrame


Problem: How do you get a count of the non-null and non-NaN values of all columns, or of selected columns, of a PySpark DataFrame, with Python examples?

Solution: In order to find non-null values of PySpark DataFrame columns, we use the isNotNull() function, for example df.name.isNotNull(); similarly, for non-NaN values we negate isnan(), for example ~isnan(df.name).
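For example, assuming a DataFrame df with a string column name and a float column number (similar to the DataFrames built below), the two filters look like this (a minimal sketch):

#Sketch: count of non-null values in a string column
df.filter(df.name.isNotNull()).count()

#Sketch: count of non-NaN values in a float/double column
from pyspark.sql.functions import isnan
df.filter(~isnan(df.number)).count()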

Note: In Python, None is the equivalent of a null value, so on a PySpark DataFrame None values are shown as null.

Let’s create a DataFrame with some null values.


#Create a SparkSession and a DataFrame that contains real nulls (None)
#as well as the literal strings "" and "NULL", which are not nulls
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [(None,"CA"),("Julia",""),("Ram",None),("Ramya","NULL")]
df = spark.createDataFrame(data,["name","state"])
df.show()

+-----+-----+
| name|state|
+-----+-----+
| null|   CA|
|Julia|     |
|  Ram| null|
|Ramya| NULL|
+-----+-----+

PySpark Count of Non-null Values of a Column


#Find count of non null value of a column
from pyspark.sql.functions import col
print(df.filter(col("name").isNotNull()).count())
#3
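
As a quick sanity check, the complementary count of null values of the same column uses isNull(); with the data above it returns 1.


#Find count of null values of a column (complement of the above)
print(df.filter(col("name").isNull()).count())
#1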

Count of Non-null Values of All DataFrame Columns


#Count of non-null for all columns
from pyspark.sql.functions import col, when, count
df.select([count(when(col(c).isNotNull() , c)).alias(c) for c in df.columns]
   ).show()

+----+-----+
|name|state|
+----+-----+
|   3|    3|
+----+-----+
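
If all you need are per-column non-null counts, DataFrame.summary("count") is a shorter alternative; note that it returns the counts as strings and adds a summary column.


#Alternative: summary("count") also returns the per-column count of non-null values
df.summary("count").show()

+-------+----+-----+
|summary|name|state|
+-------+----+-----+
|  count|   3|    3|
+-------+----+-----+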

Count of Non-null Values of Selected Columns


#Count of non-null for selected columns
from pyspark.sql.functions import col, when, count
df_Columns=["name","state"]
df.select([count(when(col(c).isNotNull(), c)).alias(c) for c in df_Columns]
   ).show()
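
Since df_Columns lists both columns of this DataFrame, the output is the same as in the previous example:

+----+-----+
|name|state|
+----+-----+
|   3|    3|
+----+-----+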

Let’s see how to also ignore literal placeholder strings such as 'None' and 'NULL' and the empty string '', which are not real nulls.


#Count of non-null by ignoring null literal values
from pyspark.sql.functions import col, when, count
df2 = df.select([count(when(~col(c).contains('None') & \
                            ~col(c).contains('NULL') & \
                            (col(c) != '' ) & \
                            ~col(c).isNull(), c 
                           )).alias(c)
                    for c in df.columns])
df2.show()

+----+-----+
|name|state|
+----+-----+
|   3|    1|
+----+-----+
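
Another approach (a sketch, assuming the placeholders are exactly the strings 'NULL' and ''): convert them to real nulls with na.replace() first, then reuse the plain isNotNull() count, which again yields 3 for name and 1 for state.


#Alternative sketch: turn literal placeholders into real nulls, then count with isNotNull()
df_clean = df.na.replace(["NULL", ""], None)
df_clean.select([count(when(col(c).isNotNull(), c)).alias(c) for c in df.columns]).show()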

PySpark Count of Non-NaN Values of a DataFrame Column

The below example demonstrates how to get a count of the non-NaN values of a PySpark DataFrame column.


#Find count of non-NaN values of a DataFrame column
import numpy as np
from pyspark.sql.functions import isnan
data = [(1,340.0),(1,None),(3,200.0),(4,np.nan)]
df = spark.createDataFrame(data,["id","number"])
df.show()
print(df.filter(~df.number.contains('None') & \
                ~df.number.contains('NULL') & \
                ~isnan(df.number) & \
                ~df.number.isNull()
            ).count())

#2
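
Conversely, the count of NaN or null values of the same column can be obtained by combining isnan() and isNull():


#Count of nan or null values of the column (the complement)
print(df.filter(isnan(df.number) | df.number.isNull()).count())
#2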

Complete Example

Following is a complete example of counting the non-null & non-NaN values of DataFrame columns.


from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
data = [(None,"CA"),("Julia",""),("Ram",None),("Ramya","NULL")]
df = spark.createDataFrame(data,["name","state"])
df.show()

from pyspark.sql.functions import col, when, count,isnan
print(df.filter(~isnan("name") & col("name").isNotNull()).count())

#Count of non-null for all columns
from pyspark.sql.functions import col, when, count,isnan
df.select([count(when(~isnan(c) & col(c).isNotNull() , c)).alias(c) for c in df.columns]
   ).show()

#Count of non-null for selected columns
df_Columns=["name","state"]
df.select([count(when(~isnan(c) & col(c).isNotNull(), c)).alias(c) for c in df_Columns]
   ).show()

#Count of non-null by ignoring null literal values
df2 = df.select([count(when(~col(c).contains('None') & \
                            ~col(c).contains('NULL') & \
                            (col(c) != '' ) & \
                            ~col(c).isNull(), c 
                           )).alias(c)
                    for c in df.columns])
df2.show()

#Count of non-nan values
from pyspark.sql.functions import isnan
import numpy as np
data = [(1,np.nan),(1,None),(3,200.0),(4,np.nan)]
df = spark.createDataFrame(data,["id","number"])
df.show()
print(df.filter(~df.number.contains('None') & \
                ~df.number.contains('NULL') & \
                ~isnan(df.number) & \
                ~df.number.isNull()
            ).count())

Conclusion

In this PySpark article, you have learned how to calculate the count of non-null & non-NaN values of all DataFrame columns and of selected columns, using Python examples.
