Simulator dependency update (#71)

d-vct · dianevct · ehudkr · web-flow · commit 52ba5eaae54f · 2024-07-31T10:57:56.000+03:00
Update pandas>2 and networkx>3 for simulation module and some unit tests #71 Add unit tests. GitHub Actions now skips on error for Code Climate failures to avoid failing the entire workflow (even if tests are OK) just because it didn't have access to the CC token for uploading the coverage report. --------------------------------------------------------- * Allow networkx >3 dependency --------- Signed-off-by: Diane Vincent <diane.vincent78@gmail.com> * Allow pandas>2 dependency * Add tests Different link types: - test_affine_linking - test_poly_linking - test_exp_linking - test_log_linking Effect modifier: check that it behaves correctly with marginal structural model - test_effect_modifier * Dummy commit to engage CodeClimate? It seems forked-branch pull requests do not initiate CodeClimate properly, causing the entire PR to look like it failed. The problem is that the forked PR doesn't have access to the upstream's secret, so CodeClimate doesn't have its token: https://github.com/BiomedSciAI/causallib/actions/runs/10010566187/job/27714442507?pr=71#step:8:17 Before I contemplate whether to make that not-really-secret secret a hardcoded token instead, I want to test whether making a dummy commit by a permitted account could make it run properly. Signed-off-by: Ehud-Karavani <ehud.karavani@ibm.com> * Don't fail entire pipeline for failed coverage report upload Forked PRs have no access to secrets, so uploading a coverage report to Code Climate can fail as no token will be provided. To avoid that failing the entire workflow, try to make that step optional and see what happens on GitHub Actions.
Signed-off-by: Ehud-Karavani <ehud.karavani@ibm.com> --------- Signed-off-by: Diane Vincent <diane.vincent78@gmail.com> Signed-off-by: Ehud-Karavani <ehud.karavani@ibm.com> Co-authored-by: Diane Vincent <diane.vincent78@gmail.com> Co-authored-by: ehudkr <ehudkaravani@gmail.com> Co-authored-by: Ehud Karavani <15989012+ehudkr@users.noreply.github.com> Co-authored-by: Ehud-Karavani <ehud.karavani@ibm.com>
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -43,5 +43,8 @@ jobs:
       - name: Publish to CodeClimate
         uses: paambaati/codeclimate-action@v5.0.0
         env:
-          CC_TEST_REPORTER_ID: ${{ secrets.CODECLIMATE_REPORTER_ID }} 
+          CC_TEST_REPORTER_ID: ${{ secrets.CODECLIMATE_REPORTER_ID }}
+        # Forked PRs have no access to secrets, so uploading a coverage report to Code Climate fails.
+        # To avoid that failing the entire workflow, continue on error:
+        continue-on-error: true
           
diff --git a/causallib/simulation/CausalSimulator3.py b/causallib/simulation/CausalSimulator3.py
@@ -216,7 +216,7 @@ def __init__(self, topology, var_types, prob_categories, link_types, snr, treatm
 
         # check that effect modifier is independent on treatment and affects only the outcome:
         for i in self.effmod_indices:
-            successors = self.graph_topology.successors(i)
+            successors = list(self.graph_topology.successors(i))
             if len(successors) == 0 or self.outcome_indices.intersection(successors).size < 1:
                 raise ValueError("Effect modifier variable {name} must affect an outcome variable".format(name=i))
             ancestors = nx.ancestors(self.graph_topology, i)
@@ -441,7 +441,7 @@ def generate_data(self, X_given=None, num_samples=None, random_seed=None):
 
         # generate latent continuous covariates - every variable is guaranteed to have a population variance of 1.0
         # X_latent = pd.DataFrame(index=patients_index, columns=self.var_types.index)
-        X = pd.DataFrame(index=patients_index, columns=self.var_types.index)
+        X = pd.DataFrame(index=patients_index, columns=self.var_types.index, dtype=float)
         if X_given is not None:  # if a dataset is given, integrate it to the current dataset being build.
             X.loc[:, X_given.columns] = X_given
             for col in X_given.columns:
@@ -1342,7 +1342,7 @@ def _poly_linking(X_parents, beta=None):
             beta = pd.DataFrame(data=np.random.normal(loc=0.0, scale=4.0, size=(degree, X_parents.columns.size)),
                                 columns=X_parents.columns, index=np.arange(degree))
 
-        result_polynomial = pd.DataFrame(data=None, index=X_parents.index, columns=X_parents.columns)
+        result_polynomial = pd.DataFrame(data=None, index=X_parents.index, columns=X_parents.columns, dtype=float)
         degrees = beta.index.to_series()
         # Apply a polynomial to every parent variable
         for var_name, col in X_parents.items():
diff --git a/causallib/tests/test_causal_simulator3.py b/causallib/tests/test_causal_simulator3.py
@@ -357,6 +357,82 @@ def test_linear_linking(self):
                          msg="discovered rank of matrix is {emp} instead of {des}."
                              "so the linear linking does not work properly".format(emp=rank, des=2))
 
+    def test_affine_linking(self):
+        """Data simulated with an affine link must form a rank-3 (full-rank) matrix."""
+        # Connect the covariate (0) and the treatment (1) with the outcome (2).
+        adjacency = np.zeros((3, 3), dtype=bool)
+        adjacency[2, 0] = True
+        adjacency[2, 1] = True
+        sim = CS3(topology=adjacency, var_types=["covariate", "treatment", "outcome"],
+                  prob_categories=[None, [0.5, 0.5], None], link_types="affine",
+                  treatment_importances=None, outcome_types=self.no_X.outcome_types,
+                  snr=1, effect_sizes=self.no_X.effect_sizes)
+        X, _, _ = sim.generate_data(num_samples=self.NUM_SAMPLES)
+
+        tolerance = 1e-10
+        spectrum = np.linalg.svd(X.astype(float).values, compute_uv=False)
+        rank = np.sum(spectrum > tolerance)  # singular values that are numerically non-zero
+        self.assertEqual(rank, 3,
+                         msg="discovered rank of matrix is {emp} instead of {des}."
+                             "so the affine linking does not work properly".format(emp=rank, des=3))
+
+    def test_poly_linking(self):
+        """Data simulated with a polynomial link must form a rank-3 (full-rank) matrix."""
+        # Connect the covariate (0) and the treatment (1) with the outcome (2).
+        adjacency = np.zeros((3, 3), dtype=bool)
+        adjacency[2, 0] = True
+        adjacency[2, 1] = True
+        sim = CS3(topology=adjacency, var_types=["covariate", "treatment", "outcome"],
+                  prob_categories=[None, [0.5, 0.5], None], link_types="poly",
+                  treatment_importances=None, outcome_types=self.no_X.outcome_types,
+                  snr=1, effect_sizes=self.no_X.effect_sizes)
+        X, _, _ = sim.generate_data(num_samples=self.NUM_SAMPLES)
+
+        tolerance = 1e-10
+        spectrum = np.linalg.svd(X.astype(float).values, compute_uv=False)
+        rank = np.sum(spectrum > tolerance)  # singular values that are numerically non-zero
+        self.assertEqual(rank, 3,
+                         msg="discovered rank of matrix is {emp} instead of {des}."
+                             "so the poly linking does not work properly".format(emp=rank, des=3))
+
+    def test_exp_linking(self):
+        """Data simulated with an exponential link must form a rank-3 (full-rank) matrix."""
+        # Connect the covariate (0) and the treatment (1) with the outcome (2).
+        adjacency = np.zeros((3, 3), dtype=bool)
+        adjacency[2, 0] = True
+        adjacency[2, 1] = True
+        sim = CS3(topology=adjacency, var_types=["covariate", "treatment", "outcome"],
+                  prob_categories=[None, [0.5, 0.5], None], link_types="exp",
+                  treatment_importances=None, outcome_types=self.no_X.outcome_types,
+                  snr=1, effect_sizes=self.no_X.effect_sizes)
+        X, _, _ = sim.generate_data(num_samples=self.NUM_SAMPLES)
+
+        tolerance = 1e-10
+        spectrum = np.linalg.svd(X.astype(float).values, compute_uv=False)
+        rank = np.sum(spectrum > tolerance)  # singular values that are numerically non-zero
+        self.assertEqual(rank, 3,
+                         msg="discovered rank of matrix is {emp} instead of {des}."
+                             "so the exp linking does not work properly".format(emp=rank, des=3))
+
+    def test_log_linking(self):
+        """Data simulated with a logarithmic link must form a rank-3 (full-rank) matrix."""
+        # Connect the covariate (0) and the treatment (1) with the outcome (2).
+        adjacency = np.zeros((3, 3), dtype=bool)
+        adjacency[2, 0] = True
+        adjacency[2, 1] = True
+        sim = CS3(topology=adjacency, var_types=["covariate", "treatment", "outcome"],
+                  prob_categories=[None, [0.5, 0.5], None], link_types="log",
+                  treatment_importances=None, outcome_types=self.no_X.outcome_types,
+                  snr=1, effect_sizes=self.no_X.effect_sizes)
+        X, _, _ = sim.generate_data(num_samples=self.NUM_SAMPLES)
+
+        tolerance = 1e-10
+        spectrum = np.linalg.svd(X.astype(float).values, compute_uv=False)
+        rank = np.sum(spectrum > tolerance)  # singular values that are numerically non-zero
+        self.assertEqual(rank, 3,
+                         msg="discovered rank of matrix is {emp} instead of {des}."
+                             "so the log linking does not work properly".format(emp=rank, des=3))
+
     def test_treatment_logistic(self):
         topology = np.zeros((6, 6), dtype=bool)
         topology[2, 0] = topology[3, 0] = topology[2, 1] = topology[3, 1] = topology[4, 2] = topology[5, 3] = True
@@ -533,6 +609,26 @@ def test_censoring(self):
         # TODO: test different link types
         # TODO: test marginal structural model (both in continuous, dichotomous and probability settings)
 
+    def test_effect_modifier(self):
+        """With a marginal structural model, only the effect modifier's outcome coefficient varies by treatment."""
+        topology = np.zeros((4, 4), dtype=bool)
+        topology[2, 0] = topology[2, 1] = topology[2, 3] = True
+        sim = CS3(topology=topology, var_types=["effect_modifier", "treatment", "outcome", "covariate"],
+                  prob_categories=[None, [0.5, 0.5], None, None], treatment_importances=None,
+                  link_types=["linear", "linear", "marginal_structural_model", "linear"],
+                  outcome_types="continuous", snr=1, effect_sizes=None)
+        sim.generate_data(num_samples=self.NUM_SAMPLES)
+
+        beta = sim.linking_coefs[2]  # coefficients of the outcome (variable 2); columns 0/1 compared per potential outcome
+        effmod_0, effmod_1 = beta.loc[0, 0], beta.loc[0, 1]
+        covariate_0, covariate_1 = beta.loc[3, 0], beta.loc[3, 1]  # report these (not the effect modifier's) on failure
+        self.assertNotEqual(effmod_0, effmod_1,
+                            msg="coefficients for potential outcomes are the same: {beta_1} = {beta_0}."
+                                "so the effect modifier does not behave properly".format(beta_0=effmod_0, beta_1=effmod_1))
+        self.assertEqual(covariate_0, covariate_1,
+                         msg="coefficients for potential outcomes are not the same: {beta_1} != {beta_0}."
+                             "so the covariate does not behave properly".format(beta_0=covariate_0, beta_1=covariate_1))
+
 
 if __name__ == "__main__":
     unittest.main()