# Install the PyDeequ package.
pip install pydeequ

# Download the gzip-compressed Video_Games review file, then unpack it to Video_Games.json.
wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFiles/Video_Games.json.gz
gunzip Video_Games.json.gz

# Read the unpacked review file into a Spark DataFrame.
ddf = spark.read.json("Video_Games.json")

# Drop the text and timestamp columns.
ddf = ddf.drop('helpful','reviewText','reviewTime','reviewerName','unixReviewTime','summary')
# Keep three columns, renaming "overall" to "rating" and "reviewerID" to "user".
ddf = ddf.selectExpr("asin","overall as rating", "reviewerID as user")

# Continue with a random sample of roughly 1% of the rows.
ddf = ddf.sample(0.01)

<!--Listing 1 - Analysis Runner-->
# Compute several metrics over ddf in a single analysis run.
# No Correlation analyzer is registered: after the selectExpr step ddf holds
# only the columns asin, rating and user, so the columns "total_rating" and
# "top rating" do not exist ("top rating" is merely the label given to the
# Compliance metric below) and such an analyzer could never yield a metric.
analysisResult = (
    AnalysisRunner(spark)
    .onData(ddf)
    .addAnalyzer(Size())
    .addAnalyzer(Completeness("user"))
    .addAnalyzer(ApproxCountDistinct("user"))
    .addAnalyzer(Mean("rating"))
    .addAnalyzer(Compliance("top rating", "rating >= 4.0"))
    .run()
)
<!--Listing 1 Ende-->

# Convert the successfully computed metrics into a Spark DataFrame and print it.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()

<!--Listing 3 - Ausführen eines ConstraintSuggestionRunner und Ausgabe im JSON-Format-->
# Run the constraint suggestion over ddf with the DEFAULT() rule set.
suggestionResult = (
    ConstraintSuggestionRunner(spark)
    .onData(ddf)
    .addConstraintRule(DEFAULT())
    .run()
)
# Pretty-print the suggestions as indented JSON.
print(json.dumps(suggestionResult, indent=2))
<!--Listing 3 Ende-->

<!--Listing 4 - Ausgabe aller Empfehlungen für Überprüfungsregeln der Videospieldaten-->
{
  "constraint_suggestions": [
    {
      "constraint_name": "CompletenessConstraint(Completeness(rating,None))",
      "column_name": "rating",
      "current_value": "Completeness: 1.0",
      "description": "'rating' is not null",
      "suggesting_rule": "CompleteIfCompleteRule()",
      "rule_description": "If a column is complete in the sample, we suggest a NOT NULL constraint",
      "code_for_constraint": ".isComplete(\"rating\")"
    },
    {
      "constraint_name": "ComplianceConstraint(Compliance('rating' has no negative values,rating >= 0,None))",
      "column_name": "rating",
      "current_value": "Minimum: 1.0",
      "description": "'rating' has no negative values",
      "suggesting_rule": "NonNegativeNumbersRule()",
      "rule_description": "If we see only non-negative numbers in a column, we suggest a corresponding constraint",
      "code_for_constraint": ".isNonNegative(\"rating\")"
    },
    {
      "constraint_name": "CompletenessConstraint(Completeness(user,None))",
      "column_name": "user",
      "current_value": "Completeness: 1.0",
      "description": "'user' is not null",
      "suggesting_rule": "CompleteIfCompleteRule()",
      "rule_description": "If a column is complete in the sample, we suggest a NOT NULL constraint",
      "code_for_constraint": ".isComplete(\"user\")"
    },
    {
      "constraint_name": "UniquenessConstraint(Uniqueness(List(user),None))",
      "column_name": "user",
      "current_value": "ApproxDistinctness: 0.976625060028345",
      "description": "'user' is unique",
      "suggesting_rule": "UniqueIfApproximatelyUniqueRule()",
      "rule_description": "If the ratio of approximate num distinct values in a column is close to the number of records (within the error of the HLL sketch), we suggest a UNIQUE constraint",
      "code_for_constraint": ".isUnique(\"user\")"
    },
    {
      "constraint_name": "CompletenessConstraint(Completeness(asin,None))",
      "column_name": "asin",
      "current_value": "Completeness: 1.0",
      "description": "'asin' is not null",
      "suggesting_rule": "CompleteIfCompleteRule()",
      "rule_description": "If a column is complete in the sample, we suggest a NOT NULL constraint",
      "code_for_constraint": ".isComplete(\"asin\")"
    }
  ]
}
<!--Ende Listing 4-->

<!--Listing 5 - Initialisieren von Checks im Spark-Dataframe-->
# A warning-level check that collects the constraints for the review data.
check = Check(spark, CheckLevel.Warning, "Video Game Review Check")

# NOTE(review): ddf is produced by sample(0.01) earlier in this file, so the
# 2,000,000-row size threshold presumably targets the unsampled data — confirm.
checkResult = (
    VerificationSuite(spark)
    .onData(ddf)
    .addCheck(
        check.hasSize(lambda x: x >= 2000000)
        .hasMin("rating", lambda x: x == 1.0)
        .hasMax("rating", lambda x: x == 5.0)
        .isComplete("user")
        .isUnique("user")
    )
    .run()
)

# Turn the per-constraint results into a Spark DataFrame and print it.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()
<!--Ende Listing 5-->
