Commit 7e2478b7 authored by lindawangg's avatar lindawangg

cleaned up files

parent e7cc17e6
......@@ -9,7 +9,8 @@
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import random "
"import random \n",
"from shutil import copyfile"
]
},
{
......@@ -18,8 +19,26 @@
"metadata": {},
"outputs": [],
"source": [
"from shutil import copyfile\n",
"savepath = 'data'"
"# set parameters here\n",
"savepath = 'data'\n",
"seed = 0\n",
"np.random.seed(seed) # Reset the seed so all runs are the same.\n",
"random.seed(seed)\n",
"MAXVAL = 255 # Range [0 255]\n",
"\n",
"# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset\n",
"imgpath = '../covid-chestxray-dataset/images' \n",
"csvpath = '../covid-chestxray-dataset/metadata.csv'\n",
"\n",
"# path to kaggle chest xray data from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia\n",
"data_path = 'chest_xray'\n",
"\n",
"# parameters for COVIDx dataset\n",
"train = []\n",
"test = []\n",
"split = 0.1 # train/test split\n",
"test_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"train_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}"
]
},
{
......@@ -29,16 +48,8 @@
"outputs": [],
"source": [
"# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814\n",
"seed = 0\n",
"np.random.seed(seed) # Reset the seed so all runs are the same.\n",
"random.seed(seed)\n",
"MAXVAL = 255 # Range [0 255]\n",
"\n",
"imgpath = '../covid-chestxray-dataset/images'\n",
"csvpath = '../covid-chestxray-dataset/metadata.csv'\n",
"csv = pd.read_csv(csvpath, nrows=None)\n",
"# Keep only the PA view.\n",
"idx_pa = csv[\"view\"] == \"PA\"\n",
"idx_pa = csv[\"view\"] == \"PA\" # Keep only the PA view\n",
"csv = csv[idx_pa]\n",
"\n",
"pneumonias = [\"COVID-19\", \"SARS\", \"MERS\", \"ARDS\", \"Streptococcus\"]\n",
......@@ -67,7 +78,8 @@
}
],
"source": [
"# data from https://github.com/ieee8023/covid-chestxray-dataset/blob/master/metadata.csv\n",
"# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset\n",
"# stored as patient id, image filename and label\n",
"filename_label = {'normal': [], 'viral': [], 'bacteria': [], 'COVID-19': []}\n",
"count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"for index, row in csv.iterrows():\n",
......@@ -77,8 +89,8 @@
" entry = [int(row['Patientid']), row['filename'], mapping[f]]\n",
" filename_label[mapping[f]].append(entry)\n",
"\n",
"print(count)\n",
"print(len(filename_label['COVID-19']))"
"print('Data distribution from covid-chestxray-dataset:')\n",
"print(count)"
]
},
{
......@@ -102,12 +114,13 @@
}
],
"source": [
"# split train/test by patientid\n",
"train = []\n",
"test = []\n",
"split = 0.1\n",
"test_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"train_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"# add covid-chestxray-dataset into COVIDx dataset\n",
"# since covid-chestxray-dataset doesn't have a test dataset\n",
"# split into train/test by patientid\n",
"# for COVIDx:\n",
"# patient 8 is used as non-COVID19 viral test\n",
"# patient 31 is used as bacterial test\n",
"# patients 19, 20, 36, 42 are used as COVID-19 viral test\n",
"\n",
"for key in filename_label.keys():\n",
" arr = np.array(filename_label[key])\n",
......@@ -141,8 +154,7 @@
"metadata": {},
"outputs": [],
"source": [
"# data from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia\n",
"data_path = 'chest_xray'\n",
"# add kaggle chest xray data into COVIDx dataset\n",
"folders = ['train', 'val', 'test']\n",
"\n",
"# train, val, test normal data\n",
......@@ -213,21 +225,14 @@
"outputs": [],
"source": [
"# export to train and test split text files\n",
"# patientid, filename, label\n",
"# format as patientid, filename, label, separated by a space\n",
"train_file = open(\"train_split.txt\",\"a\") \n",
"for sample in train:\n",
" info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\\n'\n",
" train_file.write(info)\n",
"\n",
"train_file.close()"
]
},
{
"cell_type": "code",
"execution_count": 247,
"metadata": {},
"outputs": [],
"source": [
"train_file.close()\n",
"\n",
"test_file = open(\"test_split.txt\", \"a\")\n",
"for sample in test:\n",
" info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\\n'\n",
......@@ -235,13 +240,6 @@
"\n",
"test_file.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
......
......@@ -19,8 +19,11 @@
"metadata": {},
"outputs": [],
"source": [
"# Set parameters here \n",
"INPUT_SIZE = (224, 224)\n",
"mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}"
"mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}\n",
"train_filepath = 'train_split.txt'\n",
"test_filepath = 'test_split.txt'"
]
},
{
......@@ -29,9 +32,7 @@
"metadata": {},
"outputs": [],
"source": [
"train_filepath = 'train_split.txt'\n",
"test_filepath = 'test_split.txt'\n",
"\n",
"# load in the train and test files\n",
"file = open(train_filepath, 'r') \n",
"trainfiles = file.readlines() \n",
"file = open(test_filepath, 'r')\n",
......@@ -53,8 +54,8 @@
}
],
"source": [
"print(len(trainfiles))\n",
"print(len(testfiles))"
"print('Total samples for train: ', len(trainfiles))\n",
"print('Total samples for test: ', len(testfiles))"
]
},
{
......@@ -72,6 +73,8 @@
}
],
"source": [
"# load in images\n",
"# resize to input size and normalize to 0 - 1\n",
"x_train = []\n",
"x_test = []\n",
"y_train = []\n",
......@@ -86,7 +89,7 @@
" x_test.append(img)\n",
" y_test.append(mapping[test_i[2]])\n",
"\n",
"print(x_test[0].shape)\n",
"print('Shape of test images: ', x_test[0].shape)\n",
"\n",
"for i in range(len(trainfiles)):\n",
" train_i = trainfiles[i].split()\n",
......@@ -97,7 +100,7 @@
" x_train.append(img)\n",
" y_train.append(mapping[train_i[2]])\n",
"\n",
"print(x_train[0].shape)"
"print('Shape of train images: ', x_train[0].shape)"
]
},
{
......@@ -106,7 +109,7 @@
"metadata": {},
"outputs": [],
"source": [
"# export\n",
"# export to npy to load in for training\n",
"np.save('data/x_train.npy', x_train)\n",
"np.save('data/y_train.npy', y_train)\n",
"np.save('data/x_test.npy', x_test)\n",
......@@ -123,9 +126,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python (tf1.15)",
"display_name": "Python (covid)",
"language": "python",
"name": "tf1.15"
"name": "covid"
},
"language_info": {
"codemirror_mode": {
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment