Presenting SOTA results on CIMA dataset¶
This notebook visualises the results of State-of-the-Art methods on the CIMA dataset.
Note: If you would like a further evaluation related to a new submission, you may contact JB.
[1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys
import glob, json
import shutil
import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sys.path += [os.path.abspath('.'), os.path.abspath('..')] # Add path to root
from birl.utilities.data_io import update_path
from birl.utilities.evaluate import compute_ranking
from birl.utilities.drawing import RadarChart, draw_scatter_double_scale
from bm_ANHIR.generate_regist_pairs import VAL_STATUS_TRAIN, VAL_STATUS_TEST
from bm_ANHIR.evaluate_submission import COL_TISSUE
/home/jb/.local/lib/python3.6/site-packages/dask/config.py:161: YAMLLoadWarning: calling yaml.load() without Loader=... is deprecated, as the default Loader is unsafe. Please read https://msg.pyyaml.org/load for full details.
data = yaml.load(f.read()) or {}
This notebook computes extended statistics (e.g. metrics including ranks) and visualises some additional statistics.
You can run the notebook to see the results on both scales, 10k and full. To do so, you need to unzip the particular archive in bm_CIMA to a separate folder and set this path as PATH_RESULTS below.
[2]:
# folder with all participants submissions (zip archives and/or parsed JSON metrics)
PATH_RESULTS = os.path.join(update_path('bm_CIMA'), 'size-10k')
# temporary folder for unzipping submissions; exported tables and figures are saved here as well
PATH_TEMP = os.path.abspath(os.path.expanduser('~/Desktop/CIMA_size-10k'))
# configuration needed for recomputing detailed metrics
PATH_DATASET = os.path.join(update_path('bm_ANHIR'), 'dataset_ANHIR')
PATH_TABLE = os.path.join(update_path('bm_CIMA'), 'dataset_CIMA_10k.csv')
# landmarks provided to participants, in early ANHIR stage we provided only 20% points per image pair
# NOTE: here it points to the very same folder as the complete landmarks below
PATH_LNDS_PROVIDED = os.path.join(PATH_DATASET, 'landmarks_all')
# complete landmarks dataset
PATH_LNDS_COMPLATE = os.path.join(PATH_DATASET, 'landmarks_all')
# baseline for normalization of computing time
PATH_COMP_BM = os.path.join(PATH_DATASET, 'computer-performances_cmpgrid-71.json')
# name of the field holding the tissue type of each case
FIELD_TISSUE = 'type-tissue'
# configuration for Pandas tables
pd.set_option("display.max_columns", 25)
# fail early, with an actionable message, when the folders are not prepared
assert os.path.isdir(PATH_RESULTS), 'missing folder with submissions: "%s"' % PATH_RESULTS
assert os.path.isdir(PATH_TEMP), 'missing temporary folder, create it first: "%s"' % PATH_TEMP
Some initial replacement and name adjustments
[3]:
# short aliases of the metric-name prefixes, following the notation of the paper
METRIC_LUT = {'Average-': 'A', 'Rank-': 'R', 'Median-': 'M', 'Max-': 'S'}


def col_metric_rename(col):
    """Shorten a metric name by substituting every prefix listed in `METRIC_LUT`.

    :param str col: original metric (column) name
    :return str: the name with each occurrence of a LUT key replaced by its alias
    """
    for prefix, alias in METRIC_LUT.items():
        col = col.replace(prefix, alias)
    return col
Parse and load submissions¶
Extract metrics from particular submissions¶
All submissions are expected to be zip archives in a single folder. The archive name is the author name.
[4]:
# Find all archives and unpack each of them into its own sub-folder of PATH_TEMP.
archive_paths = sorted(glob.glob(os.path.join(PATH_RESULTS, '*.zip')))
submission_dirs = []
for path_zip in tqdm.tqdm(archive_paths, desc='unzipping'):
    # the sub-folder is named after the archive (without the extension)
    sub = os.path.join(PATH_TEMP, os.path.splitext(os.path.basename(path_zip))[0])
    # extract in pure Python: no dependency on an external `unzip` binary, no shell quoting
    # issues, and a broken archive raises an error instead of being silently ignored;
    # already existing files are overwritten
    shutil.unpack_archive(path_zip, sub, format='zip')
    sub_ins = glob.glob(os.path.join(sub, '*'))
    # if the zip subfolder contains only one folder, move its content one level up
    if len(sub_ins) == 1 and os.path.isdir(sub_ins[0]):
        for p in glob.glob(os.path.join(sub_ins[0], '*')):
            shutil.move(p, sub)
    submission_dirs.append(sub)
Parse submissions and compute the final metrics. This can be computed just once.
NOTE: you can skip this step if you have already computed metrics in JSON files
[5]:
import bm_ANHIR.evaluate_submission

# switch off this module-level flag before running any evaluation
bm_ANHIR.evaluate_submission.REQUIRE_OVERLAP_INIT_TARGET = False

# arguments shared by all submissions; only the experiment/output folder differs
eval_kwargs = dict(
    path_table=PATH_TABLE,
    path_dataset=PATH_LNDS_PROVIDED,
    path_reference=PATH_LNDS_COMPLATE,
    path_comp_bm=PATH_COMP_BM,
    min_landmarks=1.,
    details=True,
    allow_inverse=True,
)

tqdm_bar = tqdm.tqdm(total=len(submission_dirs))
for path_sub in submission_dirs:
    tqdm_bar.set_description(path_sub)
    # run the evaluation with details, the results are written next to the submission
    path_json = bm_ANHIR.evaluate_submission.main(
        path_experiment=path_sub, path_output=path_sub, **eval_kwargs)
    # store a copy of the metrics among the results, named after the participant
    path_metrics = os.path.join(path_sub, 'metrics.json')
    path_named = os.path.join(PATH_RESULTS, os.path.basename(path_sub) + '.json')
    shutil.copy(path_metrics, path_named)
    tqdm_bar.update()
Load parsed measures from each experiment¶
[4]:
submission_paths = sorted(glob.glob(os.path.join(PATH_RESULTS, '*.json')))
submissions = {}
# loading all participants metrics, keyed by the JSON file name without extension
for path_sub in tqdm.tqdm(submission_paths, desc='loading'):
    with open(path_sub, 'r') as fp:
        metrics = json.load(fp)
    # The tissue type of each case is required by the following cells, so check it right
    # after loading. (The former self-assignment of this field and the shallow copy of
    # the aggregates changed nothing - there is no renaming LUT - and were dropped.)
    cases_no_tissue = [cs for cs in metrics['cases'] if FIELD_TISSUE not in metrics['cases'][cs]]
    if cases_no_tissue:
        raise KeyError('submission "%s" has cases without field "%s": %r'
                       % (path_sub, FIELD_TISSUE, cases_no_tissue))
    submissions[os.path.splitext(os.path.basename(path_sub))[0]] = metrics
print ('Users: %r' % submissions.keys())
Users: dict_keys(['ANTs', 'DROP', 'Elastix', 'RNiftyReg', 'RVSS', 'bUnwarpJ-SIFT', 'bUnwarpJ'])
[5]:
# names of all loaded submissions, in the loading order
users = list(submissions.keys())
first_user = users[0]
print('Fields: %r' % submissions[first_user].keys())

# regroup the nested records so that each kind of information is indexed by user
user_aggreg, user_computer, user_cases = {}, {}, {}
for u in users:
    user_aggreg[u] = submissions[u]['aggregates']
    user_computer[u] = submissions[u]['computer']
    user_cases[u] = submissions[u]['cases']

required_lnds = [submissions[u]['required-landmarks'] for u in users]
print('required-landmarks: %r' % required_lnds)

# the tissue kinds are collected from the cases of the first user only
tissues = {case[FIELD_TISSUE] for case in user_cases[first_user].values()}
print('found tissues: %r' % sorted(tissues))
Fields: dict_keys(['aggregates', 'cases', 'computer', 'submission-time', 'required-landmarks'])
required-landmarks: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
found tissues: ['lung-lesion', 'lung-lobes', 'mammary-gland']
Define colors and markers later used in charts
[6]:
# alphabetical order of methods, so each method keeps the same color in all charts
METHODS = sorted(submissions.keys())
# https://seaborn.pydata.org/tutorial/color_palettes.html
# https://www.codecademy.com/articles/seaborn-design-ii
COLOR_PALETTE = "Set1"
METHOD_CMAP = sns.color_palette(COLOR_PALETTE, len(submissions))
METHOD_COLORS = {m: METHOD_CMAP[i] for i, m in enumerate(METHODS)}


def list_methods_colors(methods):
    """Return the colors assigned to the given methods, in the given order.

    :param methods: iterable of method names (keys of `METHOD_COLORS`)
    :return list: one color per method
    """
    return [METHOD_COLORS[m] for m in methods]


def cmap_methods(method):
    """Return the color assigned to a single method.

    :param str method: method name (a key of `METHOD_COLORS`)
    :return: the color of the method
    """
    # FIX: the lookup used the undefined name `m` instead of the argument (NameError)
    return METHOD_COLORS[method]


# define cyclic buffer of markers for methods
# https://matplotlib.org/3.1.1/api/markers_api.html
_MARKER_BUFFER = list('.*^v<>pPhHXdD')
# _MARKER_BUFFER = list('.1234+xosD^v<>')
# the modulo makes the buffer really cyclic, so every method gets a marker
# even when there are more methods than distinct markers
METHOD_MARKERS = {m: _MARKER_BUFFER[i % len(_MARKER_BUFFER)]
                  for i, m in enumerate(submissions.keys())}


def list_methods_markers(methods):
    """Return the markers assigned to the given methods, in the given order.

    :param methods: iterable of method names (keys of `METHOD_MARKERS`)
    :return list: one marker per method
    """
    return [METHOD_MARKERS[m] for m in methods]

# display(pd.DataFrame([METHOD_COLORS, METHOD_MARKERS]).T)
Compute ranked measures¶
Extend the aggregated statistics with rank measures: compute the ranking over all cases for each selected field and average it.
[7]:
for field, field_agg in (('rTRE-Median', 'Median-rTRE'), ('rTRE-Max', 'Max-rTRE')):
    col_rank = field + '_rank'
    # Compute ranking per user in selected metric `field` over all dataset
    user_cases = compute_ranking(user_cases, field)
    for user in users:
        cases = user_cases[user]
        aggreg = user_aggreg[user]
        # first only the robust cases, then all cases; a case is taken as robust
        # when its 'Robustness' value is truthy (i.e. non-zero)
        for robust, s_robust in ((True, '-Robust'), (False, '')):
            vals = [case[col_rank] for case in cases.values()
                    if not robust or case['Robustness']]
            aggreg['Average-Rank-' + field_agg + s_robust] = np.mean(vals)
            aggreg['STD-Rank-' + field_agg + s_robust] = np.std(vals)
        # the same statistics separately for each tissue kind
        for tissue in tissues:
            name_tissue = field_agg + '__tissue_' + tissue + '__All'
            vals = [case[col_rank] for case in cases.values()
                    if case[FIELD_TISSUE] == tissue]
            aggreg['Average-Rank-' + name_tissue] = np.mean(vals)
            aggreg['STD-Rank-' + name_tissue] = np.std(vals)
Show the raw table with global statistics (joint training and testing/evaluation).
[8]:
# one row per method, one column per aggregated metric
df_aggreg_ = pd.DataFrame(user_aggreg).T
# leave out the metrics restricted to a train/test status, a tissue or "any"
skip_tags = (VAL_STATUS_TRAIN, VAL_STATUS_TEST, '_tissue_', '_any')
cols_all = [col for col in df_aggreg_.columns
            if all(tag not in col for tag in skip_tags)]
# general metrics are those without the robust suffix
cols_general = [col for col in cols_all if not col.endswith('-Robust')]
dfx = df_aggreg_.sort_values('Average-Median-rTRE')[cols_general]
display(dfx)
# Exporting results to CSV
path_csv = os.path.join(PATH_TEMP, 'results_overall.csv')
dfx.sort_index().to_csv(path_csv)
| Average-used-landmarks | Average-Robustness | STD-Robustness | Median-Robustness | Average-Rank-Median-rTRE | Average-Rank-Max-rTRE | Average-Median-rTRE | STD-Median-rTRE | Median-Median-rTRE | Average-Max-rTRE | STD-Max-rTRE | Median-Max-rTRE | Average-Average-rTRE | STD-Average-rTRE | Median-Average-rTRE | Average-Norm-Time | STD-Norm-Time | Median-Norm-Time | STD-Rank-Median-rTRE | STD-Rank-Max-rTRE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ANTs | 1.0 | 0.790214 | 0.248232 | 0.892468 | 3.342593 | 2.824074 | 0.023031 | 0.020008 | 0.016662 | 0.055601 | 0.036617 | 0.050011 | 0.024131 | 0.019472 | 0.018600 | 52.167229 | 26.893003 | 60.916335 | 1.564431 | 1.489648 |
| DROP | 1.0 | 0.842451 | 0.304637 | 0.986842 | 2.296296 | 2.564815 | 0.025039 | 0.051129 | 0.005084 | 0.062894 | 0.079209 | 0.029880 | 0.026894 | 0.051019 | 0.006521 | 1.856635 | 0.844369 | 2.160855 | 2.042417 | 2.148527 |
| bUnwarpJ | 1.0 | 0.740683 | 0.277490 | 0.831250 | 3.601852 | 3.824074 | 0.028214 | 0.020833 | 0.030013 | 0.066614 | 0.036561 | 0.064172 | 0.029630 | 0.020194 | 0.031437 | 2.989572 | 1.125316 | 2.933392 | 1.855621 | 1.762935 |
| RNiftyReg | 1.0 | 0.680507 | 0.296854 | 0.702044 | 4.222222 | 4.064815 | 0.031981 | 0.019345 | 0.032205 | 0.068474 | 0.034884 | 0.068234 | 0.033036 | 0.019073 | 0.034479 | 0.364217 | 0.598157 | 0.045782 | 1.517389 | 1.760014 |
| Elastix | 1.0 | 0.795504 | 0.164782 | 0.807692 | 4.527778 | 5.000000 | 0.037900 | 0.028974 | 0.034742 | 0.082920 | 0.044742 | 0.075468 | 0.039706 | 0.028133 | 0.037397 | 4.024381 | 0.754311 | 4.254518 | 1.807691 | 1.563472 |
| RVSS | 1.0 | 0.574520 | 0.289345 | 0.541241 | 4.981481 | 4.462963 | 0.059515 | 0.158266 | 0.032394 | 0.094764 | 0.165354 | 0.063392 | 0.060600 | 0.157822 | 0.034269 | 1.499862 | 0.825750 | 1.229713 | 1.747917 | 1.734326 |
| bUnwarpJ-SIFT | 1.0 | 0.496952 | 0.383557 | 0.540145 | 5.027778 | 5.259259 | 0.074338 | 0.079285 | 0.042298 | 0.135266 | 0.122401 | 0.096839 | 0.077425 | 0.080656 | 0.046481 | 2.659866 | 1.165022 | 2.708990 | 1.863183 | 1.796964 |
Only robust metrics (computed over image pairs with robustness higher than a threshold)
[9]:
# keep only the metrics with the robust suffix
cols_robust = [col for col in cols_all if col.endswith('-Robust')]
# the rows stay ordered by the overall (non-robust) 'Average-Median-rTRE'
df_sorted_ = pd.DataFrame(user_aggreg).T.sort_values('Average-Median-rTRE')
# strip the suffix from the column names of the shown table
dfx = df_sorted_[cols_robust].rename(columns=lambda col: col.replace('-Robust', ''))
display(dfx)
| Average-Median-rTRE | STD-Median-rTRE | Median-Median-rTRE | Average-Max-rTRE | STD-Max-rTRE | Median-Max-rTRE | Average-Average-rTRE | STD-Average-rTRE | Median-Average-rTRE | Average-Norm-Time | STD-Norm-Time | Median-Norm-Time | Average-Rank-Median-rTRE | STD-Rank-Median-rTRE | Average-Rank-Max-rTRE | STD-Rank-Max-rTRE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ANTs | 0.019158 | 0.017901 | 0.012060 | 0.051248 | 0.036556 | 0.041127 | 0.020362 | 0.017289 | 0.014229 | 50.306482 | 27.266031 | 55.401263 | 3.342593 | 1.564431 | 2.824074 | 1.489648 |
| DROP | 0.006222 | 0.007302 | 0.004133 | 0.035815 | 0.034673 | 0.024499 | 0.008198 | 0.008946 | 0.005886 | 1.926173 | 0.821015 | 2.271720 | 2.029412 | 1.768215 | 2.313725 | 1.935126 |
| bUnwarpJ | 0.024697 | 0.020917 | 0.021041 | 0.062631 | 0.038681 | 0.057320 | 0.026341 | 0.020400 | 0.023998 | 2.945704 | 1.038554 | 2.902659 | 3.601852 | 1.855621 | 3.824074 | 1.762935 |
| RNiftyReg | 0.029002 | 0.020507 | 0.030274 | 0.066526 | 0.038920 | 0.066252 | 0.030126 | 0.020228 | 0.032280 | 0.382616 | 0.612680 | 0.045494 | 4.188679 | 1.505419 | 4.028302 | 1.750782 |
| Elastix | 0.035819 | 0.028515 | 0.032164 | 0.081753 | 0.045233 | 0.071432 | 0.037716 | 0.027670 | 0.034383 | 4.007976 | 0.758227 | 3.978500 | 4.527778 | 1.807691 | 5.000000 | 1.563472 |
| RVSS | 0.029296 | 0.024597 | 0.021571 | 0.064435 | 0.045133 | 0.050208 | 0.030293 | 0.024747 | 0.023583 | 1.456545 | 0.747360 | 1.343542 | 4.913462 | 1.743728 | 4.375000 | 1.705125 |
| bUnwarpJ-SIFT | 0.032093 | 0.018648 | 0.034173 | 0.076951 | 0.034319 | 0.073679 | 0.034356 | 0.017777 | 0.034625 | 2.896436 | 1.167257 | 2.824024 | 4.505882 | 1.766509 | 4.800000 | 1.761016 |
Order the methods by the selected ranking metric; this order is used in the charts later…
[10]:
# metric defining the order of methods
col_ranking = 'Average-Rank-Median-rTRE'
# methods (rows) sorted ascending by the ranking metric
dfx = pd.DataFrame(user_aggreg).T.sort_values(by=col_ranking)
users_ranked = dfx.index
msg_ranked = 'Odered methods by "%s": %s' % (col_ranking, list(users_ranked))
print(msg_ranked)
Odered methods by "Average-Rank-Median-rTRE": ['DROP', 'ANTs', 'bUnwarpJ', 'RNiftyReg', 'Elastix', 'RVSS', 'bUnwarpJ-SIFT']
Basic visualizations¶
Show general results in a chart…
[11]:
# the standard deviations are left out of the chart
cols_shown = [col for col in cols_general if not col.startswith('STD-')]
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_shown]
# metrics on the X axis, one bar per method, logarithmic Y axis
ax = dfx.T.plot.bar(
    figsize=(len(cols_general) * 0.7, 4),
    grid=True,
    logy=True,
    rot=75,
    color=list_methods_colors(dfx.index),
)
ax.legend(bbox_to_anchor=(1.1, 0.95))
fig = ax.get_figure()
fig.tight_layout()
fig.savefig(os.path.join(PATH_TEMP, 'bars_teams-scores.pdf'))
[12]:
# (aggregated metric, short label of the Y axis)
bar_metrics = (
    ('Average-Rank-Median-rTRE', 'ARMrTRE'),
    ('Average-Median-rTRE', 'AMrTRE'),
    ('Median-Median-rTRE', 'MMrTRE'),
)
for col, name in bar_metrics:
    fig = plt.figure(figsize=(4, 2.5))
    # one value per method, sorted in ascending order
    dfx = pd.DataFrame(user_aggreg)[users_ranked].T[col].sort_values()
    ax = dfx.plot.bar(ax=fig.gca(), grid=True, rot=40, color=list_methods_colors(dfx.index))
    ax.set_ylabel(name)
    fig.tight_layout()
    fig.savefig(os.path.join(PATH_TEMP, 'bar_teams-scores_%s.pdf' % col))
Transform the per-case data into a simple flat table with extra columns for the method (user) and the case ID, to be able to draw a violin plot later.
[13]:
# one table per method (rows are cases), extended by the method name and the case ID
dfs_ = [
    pd.DataFrame(user_cases[usr]).T.assign(method=usr, case=lambda d: d.index)
    for usr in users
]
# stack the tables of all methods into a single long table
df_cases = pd.concat(dfs_).reset_index()
del dfs_

# convert every column that can be converted to a numeric type, report the others
for col in df_cases.columns:
    try:
        df_cases[col] = pd.to_numeric(df_cases[col])
    except Exception:
        print('skip not numerical column: "%s"' % col)
skip not numerical column: "name-tissue"
skip not numerical column: "type-tissue"
skip not numerical column: "name-reference"
skip not numerical column: "name-source"
skip not numerical column: "method"
Showing several distribution plots¶
[14]:
def _format_ax(ax, name, use_log=False, vmax=None):
    """Apply the common formatting to the axes of a chart.

    :param ax: axes to be formatted
    :param str name: label of the Y axis
    :param bool use_log: use the logarithmic scale for the Y axis
    :param vmax: upper limit of the Y axis; the limits are left untouched if it is not set
    """
    # NOTE: `plt.xticks` acts on the current axes, which does not have to be `ax`
    plt.xticks(rotation=60)
    if use_log:
        ax.set_yscale('log')
    if vmax:
        if use_log:
            # zero is not a valid limit of a log-scale axis: matplotlib ignores it and
            # raises a UserWarning, so set just the upper limit and keep the lower one
            ax.set_ylim(top=vmax)
        else:
            ax.set_ylim([0, vmax])
    ax.grid(True)
    ax.set_xlabel('')
    ax.set_ylabel(name)
    ax.get_figure().tight_layout()


# (field of a case, label of Y axis, use log scale, max. Y value, `bw`)
# NOTE(review): the last item is unpacked as `bw` and not used by the visible cells;
# presumably a bandwidth for the violin plots - confirm
show_metrics = [('rTRE-Median', 'MrTRE', True, None, 0.01),
                ('rTRE-Max', 'SrTRE', True, None, 0.01),
                ('Robustness', 'Robust.', False, None, 0.05),
                ('Norm-Time_minutes', 'Time [min]', True, 180, 0.1)]
[15]:
for field, name, log, vmax, bw in show_metrics:
    # values of the metric per method; the columns follow the ranked order of methods
    vals_ = [df_cases.loc[df_cases['method'] == m, field].values for m in users_ranked]
    df_ = pd.DataFrame(np.array(vals_).T, columns=users_ranked)
    # line styles of the particular elements of the box plot
    box_style = dict(
        boxprops=dict(linestyle='-', linewidth=1),
        flierprops=dict(linestyle='-', linewidth=1),
        medianprops=dict(linestyle='-', linewidth=1),
        whiskerprops=dict(linestyle='-.', linewidth=1),
        capprops=dict(linestyle='-', linewidth=1),
    )
    box_colors = dict(boxes='b', whiskers='b', medians='g', caps='k')
    fig, ax = plt.subplots(figsize=(5, 3))
    bp = df_.plot.box(ax=ax, showfliers=True, showmeans=True, color=box_colors,
                      return_type='dict', **box_style)
    _format_ax(ax, name, log, vmax)
    path_fig = os.path.join(PATH_TEMP, 'boxbar_teams-scores_%s.pdf' % field)
    ax.get_figure().savefig(path_fig)
/home/jb/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py:3477: UserWarning: Attempted to set non-positive ylimits for log-scale axis; invalid limits will be ignored.
'Attempted to set non-positive ylimits for log-scale axis; '
[16]:
for field, name, log, vmax, bw in show_metrics:
    # values of the metric per method; the columns follow the ranked order of methods
    vals_ = [df_cases[df_cases['method'] == m][field].values for m in users_ranked]
    df_ = pd.DataFrame(np.array(vals_).T, columns=users_ranked)
    fig = plt.figure(figsize=(5, 3))
    # FIX: color each violin by its method. The plain palette name assigned the colors
    # by column position (ranked order), whereas `METHOD_COLORS` follows the alphabetical
    # order, so the colors did not match the other charts.
    ax = sns.violinplot(ax=plt.gca(), data=df_, inner="quartile", trim=True, cut=0.,
                        palette=list_methods_colors(df_.columns), linewidth=1.)
    _format_ax(fig.gca(), name, log, vmax)
    fig.gca().grid(True)
    fig.savefig(os.path.join(PATH_TEMP, 'violin_teams-scores_%s.pdf' % field))
/home/jb/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py:3477: UserWarning: Attempted to set non-positive ylimits for log-scale axis; invalid limits will be ignored.
'Attempted to set non-positive ylimits for log-scale axis; '
Visualise global results¶
[17]:
# aggregated metrics shown in the radar chart
fields = [
    'Average-Max-rTRE',
    'Average-Median-rTRE',
    'Median-Median-rTRE',
    'Average-Norm-Time',
    'Average-Robustness',
]
# the robustness is replaced by its complement, the "weakness" (1 - robustness)
df = (
    pd.DataFrame(user_aggreg)[users_ranked].T[fields]
    .assign(**{'Average-Weakness': lambda d: 1 - d['Average-Robustness']})
    .drop('Average-Robustness', axis=1)
)
fig_radar = plt.figure(figsize=(5, 4))
radar = RadarChart(df, fig=fig_radar, colors=list_methods_colors(df.index), fill_alpha=0.02)
# legend placed above the chart
lgd = radar.ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.15), ncol=int(len(users) / 1.5))
# pass the labels and the legend as extra artists for the tight bounding box
path_fig = os.path.join(PATH_TEMP, 'radar_teams-scores.pdf')
radar.fig.savefig(path_fig, bbox_extra_artists=radar._labels + [lgd], bbox_inches='tight')
Visual statistic over tissue types¶
Present some statistics depending on the tissue types…
[18]:
# NOTE: this rebinds `cols_all` to all aggregated columns (it was a filtered list before)
cols_all = pd.DataFrame(user_aggreg).T.columns
# per-tissue median rTRE: no rank measures and no 'Median-Median-' statistics
col_avg_med_tissue = sorted(
    c for c in cols_all
    if 'Median-rTRE_tissue' in c and 'Rank' not in c and 'Median-Median-' not in c)
# per-tissue average robustness, again without rank measures
col_robust_tissue = sorted(
    c for c in cols_all
    if 'Average-Robustness_tissue' in c and 'Rank' not in c)
[19]:
# (shown columns, description, text dropped from the tick labels, use log scale)
params_tuple = [
    (col_avg_med_tissue, 'Avg. Median rTRE', 'Average-Median-rTRE__tissue_{}__All', True),
    (col_robust_tissue, 'Avg. Robust', 'Average-Robustness__tissue_{}__All', False),
]
for cols, desc, drop, use_log in params_tuple:
    # one row per method (in the ranked order), one column per tissue-specific metric
    dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols]
    fig, extras = draw_scatter_double_scale(
        dfx,
        colors=list_methods_colors(users_ranked),
        ax_decs={desc: None},
        idx_markers=list_methods_markers(users_ranked),
        xlabel='Methods',
        figsize=(2 + len(dfx.columns) * 0.95, 3),
        legend_style=dict(bbox_to_anchor=(1.15, 0.95), ncol=1),
        x_spread=(0.4, 5),
    )
    ax1 = extras['ax1']
    # NOTE(review): `drop` holds a literal '{}' placeholder which is never filled in,
    # so `replace` most likely matches nothing and only the prefixes get shortened - confirm
    tick_labels = [col_metric_rename(c.replace(drop, '')) for c in cols]
    ax1.set_xticks(range(len(cols)))
    ax1.set_xticklabels(tick_labels, rotation=45, ha="center")
    _format_ax(ax1, desc, use_log, vmax=None)
    # file-name friendly form of the description: no brackets nor dots, dashes for spaces
    name = ''.join(ch for ch in desc if ch not in '(.)').replace(' ', '-')
    path_fig = os.path.join(PATH_TEMP, 'scat_teams-scores_tissue-%s.pdf' % name)
    fig.savefig(path_fig, bbox_extra_artists=(extras['legend'],), bbox_inches='tight')
[ ]: