Commit 7309be4

Initial commit
0 parents, commit 7309be4

File tree

6 files changed: +947 additions, -0 deletions

.gitignore

Lines changed: 217 additions & 0 deletions
@@ -0,0 +1,217 @@
# Created by .ignore support plugin (hsz.mobi)
### JetBrains template
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
*.iml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

README.md

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
Model for the RSNA Intracranial Hemorrhage Detection competition (https://www.kaggle.com/c/rsna-intracranial-hemorrhage-detection).

ResNeXt + PCA + BiLSTM, scoring 0.04989 on the private test set.

Sequence metadata required: https://www.kaggle.com/mihailburduja/rsna-intracranial-sequence-metadata

Slices are resized to 256x256; each slice's embedding vector is reduced to 120 dimensions.

`models.py` contains the CNN and LSTM models.

`datasets.py` contains the torch Datasets for the CNN and the LSTM model.

`train_cnn.py` trains the CNN and outputs PCA embeddings and predictions.

`train_lstm.py` trains the LSTM and outputs the submission file.

datasets.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# from apex import amp
import numpy as np
import pandas as pd
import pydicom
import torch
from torch.utils.data import Dataset


def correct_dcm(dcm):
    # Fix DICOMs stored with a wrong RescaleIntercept: shift the pixel values by
    # 1000, wrap them at the 12-bit range and force the intercept back to -1000.
    x = dcm.pixel_array + 1000
    px_mode = 4096
    x[x >= px_mode] = x[x >= px_mode] - px_mode
    dcm.PixelData = x.tobytes()
    dcm.RescaleIntercept = -1000


def window_image(dcm, window_center, window_width):
    # Apply a Hounsfield window (center/width) after rescaling the raw pixel data.
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)

    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)

    return img


def bsb_window(dcm):
    # Brain / subdural / soft-tissue windows, normalised to [0, 1] and stacked
    # as a 3-channel image of shape (H, W, 3).
    brain_img = window_image(dcm, 40, 80)
    subdural_img = window_image(dcm, 80, 200)
    soft_img = window_image(dcm, 40, 380)

    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20)) / 200
    soft_img = (soft_img - (-150)) / 380
    bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)

    return bsb_img


class IntracranialDataset(Dataset):
    # Per-slice dataset for the CNN: reads a DICOM, applies the three-window
    # preprocessing and (optionally) returns the six hemorrhage labels.

    def __init__(self, csv_file, path, labels, transform=None):
        self.path = path
        self.data = pd.read_csv(csv_file)
        self.transform = transform
        self.labels = labels  # bool: whether label columns are present in the CSV

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        try:
            dicom = pydicom.dcmread(self.path + self.data.loc[idx, 'Image'] + '.dcm')
            img = bsb_window(dicom)
        except Exception:
            # Fall back to a blank image if the DICOM is missing or unreadable.
            img = np.zeros((512, 512, 3))

        if self.transform:
            augmented = self.transform(image=img)
            img = augmented['image']

        if self.labels:
            labels = torch.tensor(
                self.data.loc[
                    idx, ['epidural', 'intraparenchymal', 'intraventricular', 'subarachnoid', 'subdural', 'any']])
            return {'image': img, 'labels': labels}
        else:
            return {'image': img}


class PredictionsDataset(Dataset):
    # Per-series dataset for the LSTM: groups the CNN predictions and PCA
    # embeddings by SeriesInstanceUID, ordered by slice position.

    def __init__(self, data, col_names, features=120, train=True, series=None):
        self.data = data
        self.train = train
        self.col_names = col_names
        self.embed_cols = [str(i) for i in range(features)]

        if series is None:
            self.series = self.data['SeriesInstanceUID'].unique()
        else:
            self.series = series

    def __len__(self):
        return len(self.series)

    def __getitem__(self, idx):
        series_id = self.series[idx]
        images = self.data[self.data['SeriesInstanceUID'] == series_id].sort_values(
            by=['ImagePositionSpan', 'ImageId'])

        # In training mode the merged dataframe holds the CNN predictions as *_x
        # columns and the ground-truth labels as *_y columns.
        cols = self.col_names
        if self.train:
            cols = [x + '_x' for x in self.col_names]

        image_preds = images[cols].to_numpy().astype(float)

        if self.train:
            image_truths = images[[x + '_y' for x in self.col_names]].to_numpy().astype(float)
            image_embeds = images[self.embed_cols].to_numpy().astype(float)

            return {
                'preds': torch.tensor(image_preds).to(torch.float),
                'labels': torch.tensor(image_truths).to(torch.float),
                'embeds': torch.tensor(image_embeds).to(torch.float)
            }
        else:
            image_embeds = images[self.embed_cols].to_numpy().astype(float)
            return {
                'preds': torch.tensor(image_preds).to(torch.float),
                'embeds': torch.tensor(image_embeds).to(torch.float)
            }
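
A hedged usage sketch for `IntracranialDataset` above; the CSV name, DICOM directory and albumentations transform are placeholders, not paths or settings from this repository:

import albumentations
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader

# Hypothetical setup; the transform resizes to 256x256 as stated in the README.
transform = albumentations.Compose([
    albumentations.Resize(256, 256),
    ToTensorV2()
])

train_ds = IntracranialDataset(
    csv_file='stage_2_train.csv',      # placeholder CSV with Image + label columns
    path='stage_2_train_images/',      # placeholder DICOM directory
    labels=True,
    transform=transform
)
loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4)
batch = next(iter(loader))
print(batch['image'].shape, batch['labels'].shape)  # e.g. (32, 3, 256, 256) and (32, 6)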

models.py

Lines changed: 45 additions & 0 deletions
@@ -0,0 +1,45 @@
# from apex import amp
import torch


class ResNeXtModel(torch.nn.Module):
    # ResNeXt-101 32x8d (WSL weights) with the original classifier removed; the
    # 2048-d pooled features feed a 6-way head and are also returned for the
    # later PCA / LSTM stage.
    def __init__(self):
        super(ResNeXtModel, self).__init__()
        resnext = torch.hub.load('facebookresearch/WSL-Images', 'resnext101_32x8d_wsl')
        self.base = torch.nn.Sequential(*list(resnext.children())[:-1])
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(2048, 6)
        )

    def forward(self, input):
        features = self.base(input).reshape(-1, 2048)
        out = self.fc(features)
        return out, features


class EmbeddingSmootherModel(torch.nn.Module):
    # Bidirectional LSTM over a series of slices: each step sees the 120-d PCA
    # embedding concatenated with the 6 CNN predictions, and the classifier
    # combines the LSTM output with the raw predictions.

    def __init__(self, features=120, hidden_size=256):
        super(EmbeddingSmootherModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = torch.nn.LSTM(features + 6, self.hidden_size, num_layers=3, dropout=0.3, batch_first=True,
                                  bidirectional=True)
        # Defined but not used in forward().
        self.scan_rnn = torch.nn.GRU(6, 64, num_layers=1, batch_first=True, bidirectional=True)
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(self.hidden_size * 2 + 6, 6)
        )
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, seq, preds):
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Zero-initialised (h, c): (num_layers * 2 directions, batch, hidden_size),
        # hard-coded for batch size 1.
        hidden = (
            torch.zeros(6, 1, self.hidden_size).to(device),
            torch.zeros(6, 1, self.hidden_size).to(device)
        )

        out, hidden = self.lstm(seq, hidden)
        combined_out = torch.cat((out, preds), 2)
        out = self.classifier(self.dropout(combined_out))

        return out
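
A quick shape-check sketch for `EmbeddingSmootherModel` above, using dummy tensors for a single series; the 40-slice count is an arbitrary assumption, and the batch size is 1 to match the hard-coded hidden state:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmbeddingSmootherModel(features=120, hidden_size=256).to(device)

embeds = torch.randn(1, 40, 120, device=device)  # (batch, slices, PCA dims)
preds = torch.randn(1, 40, 6, device=device)     # (batch, slices, CNN predictions)
seq = torch.cat((embeds, preds), dim=2)          # LSTM input: 120 + 6 = 126 features

out = model(seq, preds)
print(out.shape)                                 # torch.Size([1, 40, 6])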
