update logic for mean in lstm

thatgeeman · thatgeeman · commit c6aec86db7b3 · 2024-12-01T14:07:22.000+01:00
diff --git a/nbs/02_lstm.ipynb b/nbs/02_lstm.ipynb
@@ -799,7 +799,9 @@
    "source": [
     "### Get statistics of embeddings (train)\n",
     "\n",
-    "Mean of training data to standardize the embeddings. These are global statistics for the train set."
+    "Mean of training data to standardize the embeddings. These are global statistics for the train set.\n",
+    "\n",
+    "Compute the mean and std over the `[:-1]` slice of each subset, as only this part is seen by the model."
    ]
   },
   {
@@ -820,10 +822,10 @@
    ],
    "source": [
     "window_means = np.asarray(\n",
-    "    [data_embeddings[i][\"subset\"].mean().item() for i in trn_data_idxs]\n",
+    "    [data_embeddings[i][\"subset\"][:-1].mean().item() for i in trn_data_idxs]\n",
     ")\n",
     "window_stds = np.asarray(\n",
-    "    [data_embeddings[i][\"subset\"].std().item() for i in trn_data_idxs]\n",
+    "    [data_embeddings[i][\"subset\"][:-1].std().item() for i in trn_data_idxs]\n",
     ")\n",
     "emb_mean, emb_std = window_means.mean(), window_stds.mean()\n",
     "emb_mean, emb_std"
@@ -854,10 +856,10 @@
    ],
    "source": [
     "val_window_means = np.asarray(\n",
-    "    [data_embeddings[i][\"subset\"].mean().item() for i in val_data_idxs]\n",
+    "    [data_embeddings[i][\"subset\"][:-1].mean().item() for i in val_data_idxs]\n",
     ")\n",
     "val_window_stds = np.asarray(\n",
-    "    [data_embeddings[i][\"subset\"].std().item() for i in val_data_idxs]\n",
+    "    [data_embeddings[i][\"subset\"][:-1].std().item() for i in val_data_idxs]\n",
     ")\n",
     "val_emb_mean, val_emb_std = val_window_means.mean(), val_window_stds.mean()\n",
     "val_emb_mean, val_emb_std"
diff --git a/scripts/train_lstm.py b/scripts/train_lstm.py
@@ -80,6 +80,7 @@ def get_embedding_windows(data, vae, cfg):
     logging.info(
         f"From {len(data_windowed)} windows, {len(embeddings)} Embeddings generated using VAE."
     )
+
     return embeddings
 
 
@@ -246,10 +247,17 @@ def main(cfg):
     np.random.shuffle(trn_data_idxs)
     logging.info(f"Train embedding indices: {len(trn_data_idxs)}")
     # calculate mean and std of embeddings, should be very close to 0, 1 as sampler of VAE is Normal
-    window_means = np.asarray([emb[i]["subset"].mean().item() for i in trn_data_idxs])
-    window_stds = np.asarray([emb[i]["subset"].std().item() for i in trn_data_idxs])
+    # take mean and std up to x[:-1], as this is the real training size in each subset
+    window_means = np.asarray(
+        [emb[i]["subset"][:-1].mean().item() for i in trn_data_idxs]
+    )
+    window_stds = np.asarray(
+        [emb[i]["subset"][:-1].std().item() for i in trn_data_idxs]
+    )
+    # average the per-window statistics: mean of means, mean of stds
     emb_mean, emb_std = window_means.mean(), window_stds.mean()
     logging.info(f"Embedding mean and std of train: {emb_mean} ({emb_std})")
+    params.update({"means": emb_mean, "stds": emb_std})
 
     dset_trn = TSLSTMDataset(
         emb,