Commit ad90243e authored by lindawangg's avatar lindawangg

removed archive folder

parent 304b4cfa
......@@ -16,3 +16,4 @@ data.py
export_to_meta.py
model.py
train_tf.py
archive/
......@@ -191,7 +191,7 @@ These are the final results for COVID-Net Small and COVID-Net Large.
## Pretrained Models
| Type | COVID-19 Sensitivity | # Params (M) | Model |
|:-----:|:--------------------:|:------------:|:-------------------:|
| ckpt | 80.0 | 116 |[COVID-Net Small](https://drive.google.com/file/d/1djqWcxzRehtyJV9EQsppj1YdgsP2JRQy/view?usp=sharing)|
| ckpt | 90.0 | 126 |[COVID-Net Large](https://drive.google.com/file/d/1xrxK9swFVlFI-WAYcccIgm0tt9RgawXD/view?usp=sharing)|
| Type | COVID-19 Sensitivity | # Params (M) | MACs (G) | Model |
|:-----:|:--------------------:|:------------:|:--------:|:-------------------:|
| ckpt | 80.0 | 116.6 | 2.26 |[COVID-Net Small](https://drive.google.com/file/d/1djqWcxzRehtyJV9EQsppj1YdgsP2JRQy/view?usp=sharing)|
| ckpt | 90.0 | 126.6 | 3.59 |[COVID-Net Large](https://drive.google.com/file/d/1xrxK9swFVlFI-WAYcccIgm0tt9RgawXD/view?usp=sharing)|
{
"cells": [
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import os\n",
"import random \n",
"from shutil import copyfile"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {},
"outputs": [],
"source": [
"# set parameters here\n",
"savepath = 'data'\n",
"seed = 0\n",
"np.random.seed(seed)  # Reset the seed so all runs are the same.\n",
"random.seed(seed)\n",
"MAXVAL = 255 # Range [0 255]\n",
"\n",
"# path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset\n",
"imgpath = '../covid-chestxray-dataset/images' \n",
"csvpath = '../covid-chestxray-dataset/metadata.csv'\n",
"\n",
"# path to kaggle chest xray data from https://www.kaggle.com/paultimothymooney/chest-xray-pneumonia\n",
"data_path = 'chest_xray'\n",
"\n",
"# parameters for COVIDx dataset\n",
"train = []\n",
"test = []\n",
"split = 0.1 # train/test split\n",
"test_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"train_count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}"
]
},
{
"cell_type": "code",
"execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
"# adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814\n",
"csv = pd.read_csv(csvpath, nrows=None)\n",
"idx_pa = csv[\"view\"] == \"PA\" # Keep only the PA view\n",
"csv = csv[idx_pa]\n",
"\n",
"pneumonias = [\"COVID-19\", \"SARS\", \"MERS\", \"ARDS\", \"Streptococcus\"]\n",
"pathologies = [\"Pneumonia\",\"Viral Pneumonia\", \"Bacterial Pneumonia\", \"No Finding\"] + pneumonias\n",
"pathologies = sorted(pathologies)\n",
"\n",
"mapping = dict()\n",
"mapping['COVID-19'] = 'COVID-19'\n",
"mapping['SARS'] = 'viral'\n",
"mapping['MERS'] = 'viral'\n",
"mapping['Streptococcus'] = 'bacteria'"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'normal': 0, 'viral': 11, 'bacteria': 6, 'COVID-19': 68}\n",
"68\n"
]
}
],
"source": [
"# get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset\n",
"# stored as patient id, image filename and label\n",
"filename_label = {'normal': [], 'viral': [], 'bacteria': [], 'COVID-19': []}\n",
"count = {'normal': 0, 'viral': 0, 'bacteria': 0, 'COVID-19': 0}\n",
"for index, row in csv.iterrows():\n",
" f = row['finding']\n",
" if f in mapping:\n",
" count[mapping[f]] += 1\n",
" entry = [int(row['Patientid']), row['filename'], mapping[f]]\n",
" filename_label[mapping[f]].append(entry)\n",
"\n",
"print('Data distribution from covid-chestxray-dataset:')\n",
"print(count)"
]
},
{
"cell_type": "code",
"execution_count": 243,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Key: viral\n",
"Test patients: ['8']\n",
"Key: bacteria\n",
"Test patients: ['31']\n",
"Key: COVID-19\n",
"Test patients: ['36', '42', '19', '20']\n",
"test count: {'normal': 0, 'viral': 1, 'bacteria': 4, 'COVID-19': 8}\n",
"train count: {'normal': 0, 'viral': 10, 'bacteria': 2, 'COVID-19': 60}\n"
]
}
],
"source": [
"# add covid-chestxray-dataset into COVIDx dataset\n",
"# since covid-chestxray-dataset doesn't have test dataset\n",
"# split into train/test by patientid\n",
"# for COVIDx:\n",
"# patient 8 is used as non-COVID19 viral test\n",
"# patient 31 is used as bacterial test\n",
"# patients 19, 20, 36, 42 are used as COVID-19 viral test\n",
"\n",
"for key in filename_label.keys():\n",
" arr = np.array(filename_label[key])\n",
" if arr.size == 0:\n",
" continue\n",
" # split by patients\n",
" num_diff_patients = len(np.unique(arr[:,0]))\n",
" num_test = max(1, round(split*num_diff_patients))\n",
" # select num_test number of random patients\n",
" test_patients = random.sample(list(arr[:,0]), num_test)\n",
" print('Key: ', key)\n",
" print('Test patients: ', test_patients)\n",
" # go through all the patients\n",
" for patient in arr:\n",
" if patient[0] in test_patients:\n",
" copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'test', patient[1]))\n",
" test.append(patient)\n",
" test_count[patient[2]] += 1\n",
" else:\n",
" copyfile(os.path.join(imgpath, patient[1]), os.path.join(savepath, 'train', patient[1]))\n",
" train.append(patient)\n",
" train_count[patient[2]] += 1\n",
"\n",
"print('test count: ', test_count)\n",
"print('train count: ', train_count)"
]
},
{
"cell_type": "code",
"execution_count": 244,
"metadata": {},
"outputs": [],
"source": [
"# add kaggle chest xray data into the COVIDx dataset\n",
"folders = ['train', 'val', 'test']\n",
"\n",
"# train, val, test normal data\n",
"for folder in folders: \n",
" for img in os.listdir(os.path.join(data_path, folder, 'NORMAL')):\n",
" if '.jp' in img:\n",
" new_img = img.strip('IM-')\n",
" new_img = new_img.strip('NORMAL2-IM-')\n",
" # add to current dataset\n",
" patientid = '1000' + new_img.split('-')[0] # add 1000 in front of kaggle patient ids\n",
" if folder == 'train' or folder == 'val':\n",
" # copy files to data folder\n",
" copyfile(os.path.join(data_path, folder, 'NORMAL', img), os.path.join(savepath, 'train', img))\n",
" train.append([patientid, img, 'normal'])\n",
" train_count['normal'] += 1\n",
" else:\n",
" copyfile(os.path.join(data_path, folder, 'NORMAL', img), os.path.join(savepath, 'test', img))\n",
" test.append([patientid, img, 'normal'])\n",
" test_count['normal'] += 1\n",
"\n",
"# train, val, test pneumonia data\n",
" for img in os.listdir(os.path.join(data_path, folder, 'PNEUMONIA')):\n",
" if '.jp' in img:\n",
" new_img = img.strip('person')\n",
" patientid = '1000' + new_img.split('_')[0]\n",
" p_type = 'bacteria' if 'bacteria' in new_img else 'viral'\n",
" if folder == 'train' or folder == 'val':\n",
" copyfile(os.path.join(data_path, folder, 'PNEUMONIA', img), os.path.join(savepath, 'train', img))\n",
" train.append([patientid, img, p_type])\n",
" train_count[p_type] += 1\n",
" else:\n",
" copyfile(os.path.join(data_path, folder, 'PNEUMONIA', img), os.path.join(savepath, 'test', img))\n",
" test.append([patientid, img, p_type])\n",
" test_count[p_type] += 1\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 245,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Final stats\n",
"Train count: {'normal': 1349, 'viral': 1355, 'bacteria': 2540, 'COVID-19': 60}\n",
"Test count: {'normal': 234, 'viral': 149, 'bacteria': 246, 'COVID-19': 8}\n",
"Total length of train: 5304\n",
"Total length of test: 637\n"
]
}
],
"source": [
"# final stats\n",
"print('Final stats')\n",
"print('Train count: ', train_count)\n",
"print('Test count: ', test_count)\n",
"print('Total length of train: ', len(train))\n",
"print('Total length of test: ', len(test))"
]
},
{
"cell_type": "code",
"execution_count": 246,
"metadata": {},
"outputs": [],
"source": [
"# export train and test splits to text files\n",
"# format as patientid, filename, label, separated by a space\n",
"train_file = open(\"train_split.txt\",\"a\") \n",
"for sample in train:\n",
" info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\\n'\n",
" train_file.write(info)\n",
"\n",
"train_file.close()\n",
"\n",
"test_file = open(\"test_split.txt\", \"a\")\n",
"for sample in test:\n",
" info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\\n'\n",
" test_file.write(info)\n",
"\n",
"test_file.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (covid)",
"language": "python",
"name": "covid"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
np.random.seed(0)
def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` degrees without cropping the corners.

    The angle is negated before it is handed to OpenCV so that positive
    values rotate clockwise. The output canvas is enlarged to the bounding
    box of the rotated image and the transform is re-centred on it.

    Args:
        image: Array whose first two dimensions are (height, width).
        angle: Rotation angle in degrees.

    Returns:
        The rotated image produced by ``cv2.warpAffine``.
    """
    height, width = image.shape[:2]
    center_x = width // 2
    center_y = height // 2

    # Rotation about the image centre at unit scale.
    matrix = cv2.getRotationMatrix2D((center_x, center_y), -angle, 1.0)
    abs_cos = np.abs(matrix[0, 0])
    abs_sin = np.abs(matrix[0, 1])

    # Size of the bounding box that fully contains the rotated image.
    new_width = int((height * abs_sin) + (width * abs_cos))
    new_height = int((height * abs_cos) + (width * abs_sin))

    # Move the rotation centre to the centre of the enlarged canvas.
    matrix[0, 2] += (new_width / 2) - center_x
    matrix[1, 2] += (new_height / 2) - center_y

    rotated = cv2.warpAffine(image, matrix, (new_width, new_height))
    return rotated
def horizontal_flip(image):
    """Return ``image`` mirrored via ``cv2.flip`` with flip code 1."""
    flip_code = 1
    flipped = cv2.flip(image, flip_code)
    return flipped
def shift_image(image, lr_pixels, tb_pixels):
    """Translate ``image`` by a pixel offset, keeping the original size.

    Args:
        image: Array whose first two dimensions are (rows, cols).
        lr_pixels: Offset along the x (left/right) axis, in pixels.
        tb_pixels: Offset along the y (top/bottom) axis, in pixels.

    Returns:
        The translated image produced by ``cv2.warpAffine``; the output
        has the same (cols, rows) size as the input.
    """
    num_rows, num_cols = image.shape[:2]
    translation_matrix = np.float32([[1, 0, lr_pixels], [0, 1, tb_pixels]])
    # Warp the argument itself; the original referenced the global ``img``.
    return cv2.warpAffine(image, translation_matrix, (num_cols, num_rows))
# Script configuration.
INPUT_SIZE = (224, 224)
mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}
train_filepath = 'train_split.txt'
test_filepath = 'test_split.txt'
num_samples = 3000  # target number of training examples per class

# Load the train and test split files; ``with`` closes the handles promptly.
with open(train_filepath, 'r') as file:
    trainfiles = file.readlines()
with open(test_filepath, 'r') as file:
    testfiles = file.readlines()

# Group image filenames by class label. Each split line is read as
# whitespace-separated fields where field 1 is the filename and field 2 is
# the label.
classes = {'normal': [], 'bacteria': [], 'viral': [], 'COVID-19': []}
img_aug = {'normal': [], 'bacteria': [], 'viral': [], 'COVID-19': []}
classes_test = {'normal': [], 'bacteria': [], 'viral': [], 'COVID-19': []}

for line in trainfiles:
    train_i = line.split()
    if not train_i:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    classes[train_i[2]].append(train_i[1])

for line in testfiles:
    test_i = line.split()
    if not test_i:
        continue
    classes_test[test_i[2]].append(test_i[1])

for key in classes.keys():
    print('{}: {}'.format(key, len(classes[key])))
def _count_to_augment(key):
    """Return how many images of class ``key`` to augment in the next round.

    The count is the number still missing to reach ``num_samples`` (original
    plus already-augmented images), capped at the number of original images
    because each round samples without replacement, and never negative so a
    class that already exceeds ``num_samples`` is simply left alone.
    """
    current = len(classes[key]) + len(img_aug[key])
    return max(0, min(num_samples - current, len(classes[key])))


# Augment every training class towards ``num_samples`` examples.
num_to_augment = {key: _count_to_augment(key) for key in classes}
print('num_to_augment 1:', num_to_augment)
to_augment = sum(num_to_augment.values())
print(to_augment)

while to_augment:
    for key in classes.keys():
        aug_class = classes[key]
        if not aug_class:
            # Nothing to sample from for this class.
            continue
        # sample which images to augment
        sample_indexes = np.random.choice(len(aug_class), num_to_augment[key], replace=False)
        for i in sample_indexes:
            # randomly select the degree of each augmentation
            rot = np.random.uniform(-5, 5)
            do_flip = np.random.randint(0, 2)
            shift_vert = np.random.randint(0, 2)
            shift = np.random.uniform(-10, 10)

            # read in image and apply augmentation
            src_path = os.path.join('data', 'train', aug_class[i])
            img = cv2.imread(src_path)
            if img is None:
                # cv2.imread signals failure by returning None; fail with a
                # clear message instead of an opaque OpenCV error later.
                raise FileNotFoundError('Could not read image: {}'.format(src_path))

            # NOTE: rotation and shifting are currently disabled; their
            # parameters are still drawn and encoded in the output filename.
            #img = rotate_image(img, rot)
            #if shift_vert:
            #    img = shift_image(img, 0, shift)
            #else:
            #    img = shift_image(img, shift, 0)
            if do_flip:
                img = horizontal_flip(img)

            # append filename and class to img_aug, save as png
            imgname = '{}.png'.format(aug_class[i].split('.')[0] + '_aug_r' + str(round(rot)) + '_' + str(do_flip) + '_s' + str(shift_vert) + str(round(shift)))
            img_aug[key].append(imgname)
            cv2.imwrite(os.path.join('data', 'train', imgname), img)

    # update num_to_augment numbers
    num_to_augment = {key: _count_to_augment(key) for key in classes}
    to_augment = sum(num_to_augment.values())
    print(num_to_augment)
# Write the final file lists as "<filename> <class index>" lines.
mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}

# NOTE(review): both files are opened in append mode, so re-running the
# script adds duplicate entries to existing lists -- confirm this is intended.
with open("train_augment.txt", "a") as train_file:
    for key in classes.keys():
        # Original images first, then the augmented ones, for each class.
        for imgname in classes[key] + img_aug[key]:
            train_file.write(imgname + ' ' + str(mapping[key]) + '\n')

with open("test.txt", "a") as test_file:
    for key in classes_test.keys():
        for imgname in classes_test[key]:
            test_file.write(imgname + ' ' + str(mapping[key]) + '\n')
{
"cells": [
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"import cv2\n",
"import keras\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"# Set parameters here \n",
"INPUT_SIZE = (224, 224)\n",
"mapping = {'normal': 0, 'bacteria': 1, 'viral': 2, 'COVID-19': 3}\n",
"train_filepath = 'train_split.txt'\n",
"test_filepath = 'test_split.txt'"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# load in the train and test files\n",
"file = open(train_filepath, 'r') \n",
"trainfiles = file.readlines() \n",
"file = open(test_filepath, 'r')\n",
"testfiles = file.readlines()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5304\n",
"637\n"
]
}
],
"source": [
"print('Total samples for train: ', len(trainfiles))\n",
"print('Total samples for test: ', len(testfiles))"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(224, 224, 3)\n",
"(224, 224, 3)\n"
]
}
],
"source": [
"# load in images\n",
"# resize to input size and normalize to 0 - 1\n",
"x_train = []\n",
"x_test = []\n",
"y_train = []\n",
"y_test = []\n",
"\n",
"for i in range(len(testfiles)):\n",
" test_i = testfiles[i].split()\n",
" imgpath = test_i[1]\n",
" img = cv2.imread(os.path.join('data', 'test', imgpath))\n",
" img = cv2.resize(img, INPUT_SIZE) # resize\n",
" img = img.astype('float32') / 255.0\n",
" x_test.append(img)\n",
" y_test.append(mapping[test_i[2]])\n",
"\n",
"print('Shape of test images: ', x_test[0].shape)\n",
"\n",
"for i in range(len(trainfiles)):\n",
" train_i = trainfiles[i].split()\n",
" imgpath = train_i[1]\n",
" img = cv2.imread(os.path.join('data', 'train', imgpath))\n",
" img = cv2.resize(img, INPUT_SIZE) # resize\n",
" img = img.astype('float32') / 255.0\n",
" x_train.append(img)\n",
" y_train.append(mapping[train_i[2]])\n",
"\n",
"print('Shape of train images: ', x_train[0].shape)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"# export to npy to load in for training\n",
"np.save('data/x_train.npy', x_train)\n",
"np.save('data/y_train.npy', y_train)\n",
"np.save('data/x_test.npy', x_test)\n",
"np.save('data/y_test.npy', y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (tf1.15)",
"language": "python",
"name": "tf1.15"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment