ANHIR: Evaluation Board

This notebook serves as a visualisation of the challenge results in several forms - standard tables and charts. First, all submissions have to be recomputed so that they also contain the details about each image registration pair. One of the perspectives to validate is the difference between training and testing.

The ANHIR challenge is hosted on https://anhir.grand-challenge.org.

These particular results are for the ANHIR workshop hosted at ISBI 2019 in Venice, Italy. In case you want further evaluation related to a new submission, you may contact one of the ANHIR administrators, because the full submission is required; see https://anhir.grand-challenge.org/Organizers.

[1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os, sys
import glob, json
import shutil

import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sys.path += [os.path.abspath('.'), os.path.abspath('..')]  # Add path to root
from birl.utilities.data_io import update_path
from birl.utilities.evaluate import compute_ranking, grouping_cumulative
from birl.utilities.drawing import RadarChart, draw_matrix_user_ranking, draw_scatter_double_scale
from birl.benchmark import ImRegBenchmark
from bm_ANHIR.generate_regist_pairs import VAL_STATUS_TRAIN, VAL_STATUS_TEST
from bm_ANHIR.evaluate_submission import COL_TISSUE

This notebook computes extended statistics (e.g. metrics including ranks) and visualises some additional statistics.

[2]:
# folder with all participants' submissions
PATH_SUBMISSIONS = os.path.join(update_path('bm_ANHIR'), 'submissions_paper')
# temporary folder for unzipping submissions
PATH_TEMP = os.path.abspath(os.path.expanduser('~/Desktop/ANHIR_submissions'))
# configuration needed for recomputing detail metrics
PATH_DATASET = os.path.join(update_path('bm_ANHIR'), 'dataset_ANHIR')
PATH_TABLE = os.path.join(PATH_DATASET, 'dataset_medium.csv')
# landmarks provided to participants; in the early ANHIR stage only 20% of points per image pair were provided
PATH_LNDS_PROVIDED = os.path.join(PATH_DATASET, 'landmarks_all')
# complete landmarks dataset
PATH_LNDS_COMPLETE = os.path.join(PATH_DATASET, 'landmarks_all')
# baseline for normalization of computing time
PATH_COMP_BM = os.path.join(PATH_DATASET, 'computer-performances_cmpgrid-71.json')
FIELD_TISSUE = 'type-tissue'
FIELD_STATUS = 'Status'

# configuration for Pandas tables
pd.set_option("display.max_columns", 25)

Some initial replacements and name adjustments

[3]:
# Look-Up-Table for updating plot names from the original (dataset) names
TISSUE_LUT = {
    'COAD': 'COAD', 'gastric': 'gastric',
    'breast': 'human-breast', 'kidney': 'human-kidney',
    'lung-lesion': 'lung-lesion', 'lung-lobes': 'lung-lobes',
    'mammary-gland': 'mammary-gland', 'mice-kidney': 'mouse-kidney'
}
# simplify the metric names according to the paper
METRIC_LUT = {
    'Average-': 'A',
    'Rank-': 'R',
    'Median-': 'M',
    'Max-': 'Mx',
}

def col_metric_rename(col):
    for m in METRIC_LUT:
        col = col.replace(m, METRIC_LUT[m])
    return col

def col_tissue_rename(col):
    k = col.split('_')[-1]
    if k in TISSUE_LUT:
        col = col.replace(k, TISSUE_LUT[k])
    return col
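
For instance (relying on the LUTs and helpers just defined), the two renamers behave as follows:

col_metric_rename('Average-Rank-Median-rTRE')          # -> 'ARMrTRE'
col_tissue_rename('Median-Median-rTRE_tissue_breast')  # -> 'Median-Median-rTRE_tissue_human-breast'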

Parse and load submissions

Extract metrics from particular submissions

All submissions are expected to be zip archives in a single folder; the archive name is the author's name.

[4]:
# Find all archives and unzip them to the same folder.
archive_paths = sorted(glob.glob(os.path.join(PATH_SUBMISSIONS, '*.zip')))
submission_dirs = []

for path_zip in tqdm.tqdm(archive_paths, desc='unzipping'):
    sub = os.path.join(PATH_TEMP, os.path.splitext(os.path.basename(path_zip))[0])
    os.system('unzip -o "%s" -d "%s"' % (path_zip, sub))
    sub_ins = glob.glob(os.path.join(sub, '*'))
    # if the zip contains only a single subfolder, move its content one level up
    if len(sub_ins) == 1:
        for p in glob.glob(os.path.join(sub_ins[0], '*')):
            shutil.move(p, sub)
    submission_dirs.append(sub)
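
On systems without the `unzip` binary, the same extraction could equivalently be sketched with the standard-library `zipfile` module (same folder layout assumed):

import zipfile

for path_zip in archive_paths:
    sub = os.path.join(PATH_TEMP, os.path.splitext(os.path.basename(path_zip))[0])
    with zipfile.ZipFile(path_zip) as zf:
        zf.extractall(sub)  # extract all archive members into the submission folder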

Parse the submissions and compute the final metrics. This has to be done only once.

NOTE: you can skip this step if you have already computed metrics in JSON files
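
A small guard before running the loop below (a sketch, assuming the exported JSON files keep the submission names) would limit the work to the missing submissions:

done = {os.path.splitext(os.path.basename(p))[0]
        for p in glob.glob(os.path.join(PATH_SUBMISSIONS, '*.json'))}
submission_dirs = [d for d in submission_dirs if os.path.basename(d) not in done]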

[5]:
import bm_ANHIR.evaluate_submission

bm_ANHIR.evaluate_submission.REQUIRE_OVERLAP_INIT_TARGET = False

tqdm_bar = tqdm.tqdm(total=len(submission_dirs))
for path_sub in submission_dirs:
    tqdm_bar.set_description(path_sub)
    # run the evaluation with details
    path_json = bm_ANHIR.evaluate_submission.main(
        path_experiment=path_sub,
        path_table=PATH_TABLE,
        path_dataset=PATH_LNDS_PROVIDED,
        path_reference=PATH_LNDS_COMPLETE,
        path_comp_bm=PATH_COMP_BM,
        path_output=path_sub,
        min_landmarks=1.,
        details=True,
        allow_inverse=True)
    # copy the metrics file and name it after the participant
    shutil.copy(os.path.join(path_sub, 'metrics.json'),
                os.path.join(PATH_SUBMISSIONS, os.path.basename(path_sub) + '.json'))
    tqdm_bar.update()

Load parsed measures from each submission/experiment/participant

[6]:
submission_paths = sorted(glob.glob(os.path.join(PATH_SUBMISSIONS, '*.json')))
submissions = {}
# loading all participants metrics
for path_sub in tqdm.tqdm(submission_paths, desc='loading'):
    with open(path_sub, 'r') as fp:
        metrics = json.load(fp)
    # rename tissue types according to the new LUT
    for case in metrics['cases']:
        metrics['cases'][case][FIELD_TISSUE] = TISSUE_LUT[metrics['cases'][case][FIELD_TISSUE]]
    m_agg = {col_tissue_rename(stat): metrics['aggregates'][stat] for stat in metrics['aggregates']}
    metrics['aggregates'] = m_agg
    submissions[os.path.splitext(os.path.basename(path_sub))[0]] = metrics

print ('Users: %r' % submissions.keys())
Users: dict_keys(['AGH', 'ANTs', 'CKVST', 'DROP', 'Elastix', 'MEVIS', 'NiftyReg', 'RVSS', 'TUB', 'TUNI', 'UA', 'UPENN', 'bUnwarpJ'])
[7]:
# split the particular fields inside the measured items
users = list(submissions.keys())
print ('Fields: %r' % submissions[users[0]].keys())
user_aggreg = {u: submissions[u]['aggregates'] for u in users}
user_computer = {u: submissions[u]['computer'] for u in users}
user_cases = {u: submissions[u]['cases'] for u in users}
print ('required-landmarks: %r' % [submissions[u]['required-landmarks'] for u in users])
tissues = set(user_cases[users[0]][cs][FIELD_TISSUE] for cs in user_cases[users[0]])
print ('found tissues: %r' % sorted(tissues))
Fields: dict_keys(['aggregates', 'cases', 'computer', 'submission-time', 'required-landmarks'])
required-landmarks: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
found tissues: ['COAD', 'gastric', 'human-breast', 'human-kidney', 'lung-lesion', 'lung-lobes', 'mammary-gland', 'mouse-kidney']

Present simple statistics about the ratio of train/test image pairs per dataset…

[8]:
# add stats of train/test per tissue
df_tissue_stat = pd.DataFrame([(cs['name-tissue'], cs[FIELD_TISSUE], cs[FIELD_STATUS])
                               for cs in user_cases[users[0]].values()],
                              columns=['name', FIELD_TISSUE, FIELD_STATUS])
df_tissue_stat.set_index('name', inplace=True)

# compute cumulative stat
dfx = grouping_cumulative(df_tissue_stat, col_index=FIELD_TISSUE, col_column=FIELD_STATUS)
display(dfx.T)

# show results
# ax = dfx.plot(kind='bar', stacked=True, figsize=(len(dfx) / 2, 3), grid=True, cmap='RdYlGn')
# _= plt.ylabel('# regist. pairs')
# ax.get_figure().savefig(os.path.join(PATH_TEMP, 'bars_tissue-status_abs.pdf'))

# normalize over the sum of all cases
dfx = (dfx.T / dfx.T.sum().astype(float)).T
ax = dfx.plot(kind='bar', stacked=True, figsize=(len(dfx) / 2, 2), grid=True, cmap='RdYlGn')
_= plt.ylabel('% regist. pairs')
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25), ncol=2, fancybox=True, shadow=True)
ax.get_figure().savefig(os.path.join(PATH_TEMP, 'bars_tissue-status_rltv.pdf'))
type-tissue COAD gastric human-breast human-kidney lung-lesion lung-lobes mammary-gland mouse-kidney
evaluation 153.0 40.0 20.0 20.0 NaN NaN NaN 18.0
training 84.0 13.0 5.0 5.0 30.0 40.0 38.0 15.0
../_images/notebooks_ANHIR_evaluation-board_15_1.png
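
Note that the cumulative grouping above is essentially a contingency table; with plain pandas it could be approximated as:

dfx = pd.crosstab(df_tissue_stat[FIELD_TISSUE], df_tissue_stat[FIELD_STATUS])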

Define colors and markers later used in charts

[9]:
METHODS = sorted(submissions.keys())
METHOD_CMAP = plt.get_cmap('nipy_spectral', len(submissions))
METHOD_COLORS = {m: METHOD_CMAP(i) for i, m in enumerate(METHODS)}

def list_methods_colors(methods):
    return [METHOD_COLORS[m] for m in methods]

def cmap_methods(method):
    return METHOD_COLORS[method]

# define cyclic buffer of markers for methods
# https://matplotlib.org/3.1.1/api/markers_api.html
METHOD_MARKERS = dict(zip(submissions.keys(), list('.*^v<>pPhHXdD')))
# METHOD_MARKERS = dict(zip(submissions.keys(), list('.1234+xosD^v<>')))

def list_methods_markers(methods):
    return [METHOD_MARKERS[m] for m in methods]

display(pd.DataFrame([METHOD_COLORS, METHOD_MARKERS]).T)
0 1
AGH (0.0, 0.0, 0.0, 1.0) .
ANTs (0.5111, 0.0, 0.577766666667, 1.0) *
CKVST (0.0, 0.0, 0.733366666667, 1.0) ^
DROP (0.0, 0.4667, 0.8667, 1.0) v
Elastix (0.0, 0.644466666667, 0.733366666667, 1.0) <
MEVIS (0.0, 0.644466666667, 0.355533333333, 1.0) >
NiftyReg (0.0, 0.7333, 0.0, 1.0) p
RVSS (0.0, 0.955566666667, 0.0, 1.0) P
TUB (0.799966666667, 0.977766666667, 0.0, 1.0) h
TUNI (1.0, 0.8, 0.0, 1.0) H
UA (1.0, 0.2, 0.0, 1.0) X
UPENN (0.844466666667, 0.0, 0.0, 1.0) d
bUnwarpJ (0.8, 0.8, 0.8, 1.0) D

Compute ranked measures

Extend the aggregated statistics with rank measures: compute the ranking over all cases for each selected field and average it
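
For reference, the per-case ranking added by `compute_ranking` can be illustrated with a small self-contained pandas sketch (hypothetical values; the birl helper operates on the nested `user_cases` dict instead):

import pandas as pd

# hypothetical rTRE-Median values of three methods on two image pairs
demo = pd.DataFrame({'case-1': [0.002, 0.004, 0.003],
                     'case-2': [0.005, 0.001, 0.009]},
                    index=['method-A', 'method-B', 'method-C'])
ranks = demo.rank(axis=0)   # rank the methods within each case (1 = best)
print(ranks.mean(axis=1))   # average rank per method, analogous to Average-Rank-... below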

[10]:
for field, field_agg in [('rTRE-Median', 'Median-rTRE'),
                         ('rTRE-Max', 'Max-rTRE')]:
    # compute the ranking per user for the selected metric `field` over the whole dataset
    user_cases = compute_ranking(user_cases, field)
    for user in users:
        # iterate over robust-only and all cases
        for robust in [True, False]:
            # keep only robust cases if required
            vals = [user_cases[user][cs][field + '_rank'] for cs in user_cases[user]
                    if (robust and user_cases[user][cs]['Robustness']) or (not robust)]
            s_robust = '-Robust' if robust else ''
            user_aggreg[user]['Average-Rank-' + field_agg + s_robust] = np.mean(vals)
        # iterate over all tissue kinds
        for tissue in tissues:
            vals = [user_cases[user][cs][field + '_rank'] for cs in user_cases[user]
                    if user_cases[user][cs][FIELD_TISSUE] == tissue]
            user_aggreg[user]['Average-Rank-' + field_agg + '_tissue_' + tissue] = np.mean(vals)

Show the raw table with the global statistics (joint training and testing/evaluation).

[11]:
cols_general = [col for col in pd.DataFrame(user_aggreg).T.columns
                if not any(n in col for n in [VAL_STATUS_TRAIN, VAL_STATUS_TEST, '_tissue_']) and not col.endswith('-Robust')]
dfx = pd.DataFrame(user_aggreg).T.sort_values('Average-Median-rTRE')[cols_general]
display(dfx)
# Exporting results to CSV
dfx.sort_values('Average-Median-rTRE').to_csv(os.path.join(PATH_TEMP, 'results_overall.csv'))
Average-Average-rTRE Average-Max-rTRE Average-Median-rTRE Average-Norm-Time Average-Rank-Max-rTRE Average-Rank-Median-rTRE Average-Robustness Average-used-landmarks Median-Average-rTRE Median-Max-rTRE Median-Median-rTRE Median-Norm-Time Median-Robustness
UPENN 0.004057 0.023043 0.002790 1.451193 4.234927 3.403326 0.988797 1.0 0.003009 0.017465 0.001918 1.374846 1.000000
AGH 0.005636 0.030005 0.003804 6.863679 6.226611 3.567568 0.977013 1.0 0.003395 0.023059 0.001973 5.791019 1.000000
MEVIS 0.005191 0.026069 0.003852 0.145392 4.825364 2.979210 0.984455 1.0 0.002889 0.018566 0.001815 0.141754 1.000000
TUB 0.004731 0.014927 0.004099 0.000705 2.465696 2.844075 0.991911 1.0 0.001203 0.004634 0.001021 0.000510 1.000000
CKVST 0.006044 0.026128 0.004609 7.127142 5.825364 6.280665 0.973015 1.0 0.004652 0.020775 0.003314 7.488842 1.000000
TUNI 0.010363 0.038723 0.008724 10.320549 6.370062 5.993763 0.889920 1.0 0.003719 0.023399 0.002501 8.977965 1.000000
RVSS 0.047089 0.103180 0.045024 4.723187 8.523909 9.640333 0.795752 1.0 0.007098 0.029411 0.005490 4.881740 0.987500
UA 0.056887 0.119045 0.054878 1.470925 8.912682 10.137214 0.807619 1.0 0.010979 0.035972 0.009043 1.367489 0.973684
DROP 0.061602 0.122958 0.061336 3.406355 7.288981 6.869023 0.886052 1.0 0.004340 0.026460 0.002796 3.388050 0.990654
Elastix 0.069476 0.137054 0.068433 2.962337 8.798337 9.831601 0.766790 1.0 0.007977 0.038993 0.005409 2.682059 0.970588
ANTs 0.069322 0.134296 0.068621 43.092353 7.839917 9.037422 0.813717 1.0 0.008654 0.035855 0.006747 41.008271 0.971831
bUnwarpJ 0.079704 0.149613 0.079557 9.151172 9.571726 9.648649 0.793992 1.0 0.025579 0.065169 0.024590 7.469397 0.931034
NiftyReg 0.082488 0.151446 0.082781 0.151164 10.116424 10.767152 0.749487 1.0 0.034631 0.067933 0.032717 0.052888 0.851852

Only robust metrics (computed over image pairs with robustness above a threshold)

[12]:
cols_robust = [col for col in pd.DataFrame(user_aggreg).T.columns
                if not any(n in col for n in [VAL_STATUS_TRAIN, VAL_STATUS_TEST, '_tissue_']) and col.endswith('-Robust')]
dfx = pd.DataFrame(user_aggreg).T.sort_values('Average-Median-rTRE')[cols_robust]
dfx.columns = list(map(lambda c: c.replace('-Robust', ''), dfx.columns))
display(dfx)
Average-Average-rTRE Average-Max-rTRE Average-Median-rTRE Average-Norm-Time Average-Rank-Max-rTRE Average-Rank-Median-rTRE Median-Average-rTRE Median-Max-rTRE Median-Median-rTRE Median-Norm-Time
UPENN 0.004057 0.023043 0.002790 1.451193 4.234927 3.403326 0.003009 0.017465 0.001918 1.374846
AGH 0.005610 0.029932 0.003777 6.860032 6.226611 3.567568 0.003381 0.023048 0.001968 5.787261
MEVIS 0.004320 0.024423 0.002923 0.145501 4.825364 2.979210 0.002888 0.018537 0.001811 0.141826
TUB 0.003539 0.013330 0.002890 0.000706 2.445833 2.822917 0.001169 0.004472 0.001016 0.000510
CKVST 0.005891 0.025866 0.004451 7.146332 5.825364 6.280665 0.004634 0.020768 0.003313 7.503305
TUNI 0.007183 0.033379 0.005647 9.131541 5.810069 5.395881 0.003350 0.020732 0.002225 8.767961
RVSS 0.012546 0.042371 0.010725 5.072646 8.033254 9.289786 0.005474 0.024088 0.004165 5.169940
UA 0.016040 0.045178 0.014523 1.464565 8.721133 10.002179 0.008984 0.031842 0.007342 1.372182
DROP 0.032903 0.076700 0.032100 3.465610 7.172340 6.748936 0.003880 0.023597 0.002558 3.415281
Elastix 0.042368 0.098012 0.040490 3.239014 8.704741 9.778017 0.005424 0.028498 0.004064 2.919051
ANTs 0.046432 0.098135 0.045789 40.351943 7.583519 8.868597 0.006468 0.029296 0.005023 39.914611
bUnwarpJ 0.056684 0.115475 0.055782 9.269456 9.560669 9.640167 0.017620 0.050233 0.016497 7.501626
NiftyReg 0.065827 0.126228 0.065719 0.158154 10.076433 10.736730 0.027914 0.058758 0.027238 0.051688

Order the methods by a particular ranking criterion; this ordering shall be used later…

[13]:
col_ranking = 'Average-Rank-Median-rTRE'
dfx = pd.DataFrame(user_aggreg).T.sort_values(col_ranking)
# display(dfx[[col_ranking]])
users_ranked = dfx.index
print('Ordered methods by "%s": %s' % (col_ranking, list(users_ranked)))
Ordered methods by "Average-Rank-Median-rTRE": ['TUB', 'MEVIS', 'UPENN', 'AGH', 'TUNI', 'CKVST', 'DROP', 'ANTs', 'RVSS', 'bUnwarpJ', 'Elastix', 'UA', 'NiftyReg']

Show general results in a chart…

[14]:
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_general]
ax = dfx.T.plot.bar(figsize=(len(cols_general) * 0.9, 6), grid=True, logy=True, rot=75, cmap=plt.cm.tab20)
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25),
          ncol=int(len(users) / 1.8), fancybox=True, shadow=True)
ax.get_figure().tight_layout()
ax.get_figure().savefig(os.path.join(PATH_TEMP, 'bars_teams-scores.pdf'))
# plt.savefig(os.path.join(PATH_TEMP, 'fig_teams-scores.pdf'), constrained_layout=True)
../_images/notebooks_ANHIR_evaluation-board_28_0.png

Compare statistics “train” vs “test”

Compute the statistics on training and testing cases, also for the newly added rank measures.

[15]:
for field, field_agg in [# ('rTRE-Median', 'Median-rTRE'),
                         ('rTRE-Median_rank', 'Rank-Median-rTRE'),
                         # ('rTRE-Max', 'Max-rTRE'),
                         ('rTRE-Max_rank', 'Rank-Max-rTRE'),
                         # ('Norm-Time_minutes', 'Norm-Time'),
                        ]:
    for user in users:
        for state in [VAL_STATUS_TRAIN, VAL_STATUS_TEST]:
            # filter values per user, metric and state
            vals = [user_cases[user][cs][field] for cs in user_cases[user]
                    if user_cases[user][cs]['Status'] == state]
            user_aggreg[user]['Average-' + field_agg + '_' + state] = np.nanmean(vals)

Now order the methods by a particular criterion… here the ARMrTRE on the test set.

[16]:
col_ranking = 'Average-Rank-Median-rTRE_' + VAL_STATUS_TEST
dfx = pd.DataFrame(user_aggreg).T.sort_values(col_ranking)
# display(dfx[[col_ranking]])
users_ranked = dfx.index
print('Ordered methods by "%s": \n%s' % (col_ranking, list(users_ranked)))
Ordered methods by "Average-Rank-Median-rTRE_evaluation":
['MEVIS', 'AGH', 'UPENN', 'CKVST', 'TUB', 'TUNI', 'DROP', 'ANTs', 'RVSS', 'bUnwarpJ', 'Elastix', 'UA', 'NiftyReg']
[17]:
df = pd.DataFrame(user_aggreg).T
cols_train_test = [col for col in df.columns
                   if any(st in col for st in [VAL_STATUS_TRAIN, VAL_STATUS_TEST]) and '_tissue_' not in col]
print (cols_train_test)
['Average-Average-rTRE_evaluation', 'Average-Average-rTRE_training', 'Average-Max-rTRE_evaluation', 'Average-Max-rTRE_training', 'Average-Median-rTRE_evaluation', 'Average-Median-rTRE_training', 'Average-Rank-Max-rTRE_evaluation', 'Average-Rank-Max-rTRE_training', 'Average-Rank-Median-rTRE_evaluation', 'Average-Rank-Median-rTRE_training', 'Average-Robustness_evaluation', 'Average-Robustness_training', 'Median-Average-rTRE_evaluation', 'Median-Average-rTRE_training', 'Median-Max-rTRE_evaluation', 'Median-Max-rTRE_training', 'Median-Median-rTRE_evaluation', 'Median-Median-rTRE_training', 'Median-Robustness_evaluation', 'Median-Robustness_training']

Show the raw tables with statistics for the train and test cases separately.

[18]:
from IPython.display import display

df_train_eval = pd.DataFrame()

for state in [VAL_STATUS_TRAIN, VAL_STATUS_TEST]:
    cols = [col for col in cols_train_test if state in col]
    df = pd.DataFrame(user_aggreg).T[cols]
    df.columns = list(map(lambda c: c.replace('_' + state, ''), df.columns))
    df['state'] = state
    df_train_eval = pd.concat([df_train_eval, df])

# df = pd.DataFrame(user_aggreg).T[cols]
# cols = [c for c in df_all_train_eval.columns if 'state' not in c and c in df.columns]
# df = df[cols]
# df['state'] = 'all'
# df_all_train_eval = pd.concat([df_all_train_eval, df])

df_train_eval['method'] = df_train_eval.index
df_train_eval = df_train_eval.set_index(['method', 'state']).sort_index()
df_train_eval.to_csv(os.path.join(PATH_TEMP, 'results-methods_tran-test.csv'))
display(df_train_eval)
Average-Average-rTRE Average-Max-rTRE Average-Median-rTRE Average-Rank-Max-rTRE Average-Rank-Median-rTRE Average-Robustness Median-Average-rTRE Median-Max-rTRE Median-Median-rTRE Median-Robustness
method state
AGH evaluation 0.005283 0.028339 0.003587 6.000000 3.422311 0.982096 0.003220 0.022489 0.001913 1.000000
training 0.006021 0.031823 0.004042 6.473913 3.726087 0.971467 0.003656 0.023500 0.002023 1.000000
ANTs evaluation 0.099126 0.186060 0.099164 7.788845 9.231076 0.788870 0.007222 0.035078 0.005799 0.971429
training 0.036797 0.077806 0.035289 7.895652 8.826087 0.840831 0.009663 0.037072 0.007837 0.974192
CKVST evaluation 0.004294 0.023934 0.002693 5.274900 4.406375 0.988340 0.003234 0.018857 0.002258 1.000000
training 0.007955 0.028522 0.006701 6.426087 8.326087 0.956290 0.006027 0.022732 0.005048 1.000000
DROP evaluation 0.086123 0.164415 0.086654 7.430279 7.055777 0.882477 0.004216 0.027342 0.002780 0.989247
training 0.034842 0.077716 0.033706 7.134783 6.665217 0.889953 0.004641 0.025512 0.002800 0.996403
Elastix evaluation 0.096407 0.185674 0.095648 8.876494 10.043825 0.847687 0.007354 0.035312 0.005397 0.972222
training 0.040086 0.083994 0.038734 8.713043 9.600000 0.678507 0.008457 0.041956 0.005506 0.959184
MEVIS evaluation 0.004361 0.025107 0.002870 5.035857 2.840637 0.988025 0.002740 0.018827 0.001790 1.000000
training 0.006096 0.027120 0.004924 4.595652 3.130435 0.980559 0.002964 0.018286 0.001857 1.000000
NiftyReg evaluation 0.111982 0.201039 0.113601 10.083665 11.075697 0.742689 0.037248 0.071369 0.035508 0.851852
training 0.050301 0.097324 0.049148 10.152174 10.430435 0.756907 0.033469 0.067267 0.031194 0.850657
RVSS evaluation 0.047214 0.104800 0.044767 8.418327 9.649402 0.815460 0.006302 0.027516 0.004587 0.992806
training 0.046953 0.101412 0.045304 8.639130 9.630435 0.774246 0.008092 0.031966 0.006542 0.985612
TUB evaluation 0.008945 0.028027 0.007753 3.808765 4.533865 0.984536 0.002887 0.017783 0.002070 1.000000
training 0.000132 0.000630 0.000111 1.000000 1.000000 0.999959 0.000124 0.000450 0.000106 1.000000
TUNI evaluation 0.006395 0.028668 0.004779 5.796813 5.322709 0.982278 0.003128 0.020421 0.002059 1.000000
training 0.014693 0.049695 0.013028 6.995652 6.726087 0.789128 0.005126 0.027064 0.003106 1.000000
UA evaluation 0.053601 0.112441 0.050650 8.828685 10.278884 0.820945 0.010020 0.035309 0.008247 0.985294
training 0.060474 0.126252 0.059492 9.004348 9.982609 0.793076 0.011664 0.037880 0.009985 0.956695
UPENN evaluation 0.004198 0.023865 0.002858 4.290837 3.466135 0.989808 0.002897 0.019016 0.001899 1.000000
training 0.003902 0.022146 0.002717 4.173913 3.334783 0.987693 0.003058 0.016792 0.001948 1.000000
bUnwarpJ evaluation 0.109715 0.199536 0.110519 9.366534 9.673307 0.789940 0.028998 0.072711 0.025988 0.931034
training 0.046954 0.095131 0.045769 9.795652 9.621739 0.798414 0.024586 0.060100 0.023791 0.928930
[19]:
for col, name in [('Average-Rank-Median-rTRE_evaluation', 'ARMrTRE (eval.)'),
                  ('Average-Median-rTRE_evaluation', 'AMrTRE (eval.)'),
                  ('Median-Median-rTRE_evaluation', 'MMrTRE (eval.)')]:
    plt.figure(figsize=(4, 3))
    dfx = pd.DataFrame(user_aggreg)[users_ranked].T[col].sort_values()
    ax = dfx.plot.bar(grid=True, rot=90, color=list_methods_colors(dfx.index))
    # ax = pd.DataFrame(user_aggreg).T.sort_values(col)[col].plot.bar(grid=True, rot=90, color='blue')
    _= plt.ylabel(name)
    ax.get_figure().tight_layout()
    ax.get_figure().savefig(os.path.join(PATH_TEMP, 'bar_teams-scores_%s.pdf' % col))
../_images/notebooks_ANHIR_evaluation-board_37_0.png
../_images/notebooks_ANHIR_evaluation-board_37_1.png
../_images/notebooks_ANHIR_evaluation-board_37_2.png

Transform the per-case data into a simple flat form with extra columns for the user (method) and the case ID, so that a violin plot can be drawn later.

[20]:
dfs_cases = []
for usr in users:
    df = pd.DataFrame(user_cases[usr]).T
    df['method'] = usr
    df['case'] = df.index
    dfs_cases.append(df)
df_cases = pd.concat(dfs_cases).reset_index()

for col in df_cases.columns:
    try:
        df_cases[col] = pd.to_numeric(df_cases[col])
    except Exception:
        print('skip not numerical column: "%s"' % col)
# df_cases.head()
skip not numerical column: "Status"
skip not numerical column: "name-reference"
skip not numerical column: "name-source"
skip not numerical column: "name-tissue"
skip not numerical column: "type-tissue"
skip not numerical column: "method"

Show several violin plots

[21]:
def _format_ax(ax, name, use_log=False, vmax=None):
    plt.xticks(rotation=60)
    if use_log:
        ax.set_yscale('log')
    if vmax:
        ax.set_ylim([0, vmax])
    ax.grid(True)
    ax.set_xlabel('')
    ax.set_ylabel(name)
    ax.get_figure().tight_layout()
[22]:
sns.set(style="whitegrid", palette="pastel", color_codes=True)
for field, name, log, vmax, bw in [('rTRE-Median', 'MrTRE', True, None, 0.01),
                                   # ('rTRE-Median', '', False, 0.1, 0.01),
                                   ('rTRE-Max', 'MxrTRE', False, 0.2, 0.02),
                                   ('rTRE-Median_rank', 'RMrTRE', False, None, 0.05),
                                   ('Norm-Time_minutes', 'Time [min]', True, 200, 0.1)]:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.violinplot(ax=ax, data=df_cases, hue='Status', y=field, x='method', order=users_ranked,
                   cut=0, split=True, bw=bw, scale='area', scale_hue=True, inner='quartile',
                   palette={"evaluation": "r", "training": "b"}, linewidth=1.)
    _format_ax(ax, name, log, vmax)
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.1), ncol=2, fancybox=True, shadow=False)
    ax.get_figure().savefig(os.path.join(PATH_TEMP, 'hist_teams-scores_%s.pdf' % field))
/home/jb/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py:3477: UserWarning: Attempted to set non-positive ylimits for log-scale axis; invalid limits will be ignored.
  'Attempted to set non-positive ylimits for log-scale axis; '
../_images/notebooks_ANHIR_evaluation-board_42_1.png
../_images/notebooks_ANHIR_evaluation-board_42_2.png
../_images/notebooks_ANHIR_evaluation-board_42_3.png
../_images/notebooks_ANHIR_evaluation-board_42_4.png
[23]:
for field, name, log, vmax, bw in [('rTRE-Median', 'MrTRE', True, None, 0.01),
                                   # ('rTRE-Median_rank', 'RMrTRE', False, None, 0.05),
                                   ('Norm-Time_minutes', 'Time [min]', True, 50, 0.1)]:
    for st, dfg in df_cases.groupby('Status'):
        # methods_ = list(dfg['method'].unique())
        vals_ = [dfg[dfg['method'] == m][field].values for m in users_ranked]
        dfg_ = pd.DataFrame(np.array(vals_).T, columns=users_ranked)
        fig, ax = plt.subplots(figsize=(7, 4))
        bp = dfg_.plot.box(ax=ax, showfliers=True, showmeans=True,
                           color=dict(boxes='b', whiskers='b', medians='g', caps='k'),
                           boxprops=dict(linestyle='-', linewidth=2),
                           flierprops=dict(linestyle='-', linewidth=2),
                           medianprops=dict(linestyle='-', linewidth=2),
                           whiskerprops=dict(linestyle='-.', linewidth=2),
                           capprops=dict(linestyle='-', linewidth=2),
                           return_type='dict')
        _format_ax(ax, name, log, vmax)
        ax.set_title(st)
        ax.get_figure().savefig(os.path.join(PATH_TEMP, 'boxbar_teams-scores_%s_%s.pdf' % (field, st)))
/home/jb/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py:3477: UserWarning: Attempted to set non-positive ylimits for log-scale axis; invalid limits will be ignored.
  'Attempted to set non-positive ylimits for log-scale axis; '
/home/jb/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py:3477: UserWarning: Attempted to set non-positive ylimits for log-scale axis; invalid limits will be ignored.
  'Attempted to set non-positive ylimits for log-scale axis; '
../_images/notebooks_ANHIR_evaluation-board_43_1.png
../_images/notebooks_ANHIR_evaluation-board_43_2.png
../_images/notebooks_ANHIR_evaluation-board_43_3.png
../_images/notebooks_ANHIR_evaluation-board_43_4.png

Visualise global results

[24]:
fields = ['Average-Max-rTRE', 'Average-Max-rTRE-Robust',
          'Average-Median-rTRE', 'Average-Median-rTRE-Robust',
          'Median-Median-rTRE', 'Median-Median-rTRE-Robust',
          'Average-Rank-Max-rTRE', 'Average-Rank-Median-rTRE',
          'Average-Norm-Time', 'Average-Norm-Time-Robust',
          'Average-Robustness',]
df = pd.DataFrame(user_aggreg)[users_ranked].T[fields]
df['Average-Weakness'] = 1 - df['Average-Robustness']
del df['Average-Robustness']

radar = RadarChart(df, fig=plt.figure(figsize=(9, 8)), colors=list_methods_colors(df.index))
radar.ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.05),
                ncol=int(len(users) / 1.8), fancybox=True, shadow=True)
radar.fig.tight_layout()
radar.fig.savefig(os.path.join(PATH_TEMP, 'radar_teams-scores.pdf'))
/home/jb/.local/lib/python3.6/site-packages/matplotlib/figure.py:2299: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
  warnings.warn("This figure includes Axes that are not compatible "
../_images/notebooks_ANHIR_evaluation-board_45_1.png

Visualise partial rankings

With the given metrics, compute the ranking among all participants; each user has a unique colour and the users are sorted in ascending order…

[25]:
cols = list(filter(lambda c: not any([n in c for n in ('Robust', 'Time', 'landmarks')]), cols_general))
# cols = ['Average-Rank-Median-rTRE', 'Average-Median-rTRE', 'Average-Max-rTRE']
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols]

cols_rank = list(filter(lambda c: 'Rank-' in c, cols))
# https://matplotlib.org/3.1.1/api/markers_api.html
fig, (ax, _) = draw_scatter_double_scale(
    dfx, colors=list_methods_colors(users_ranked), ax_decs={'Median rTRE': None, 'Rank Median rTRE': cols_rank},
    idx_markers=list_methods_markers(users_ranked), xlabel='Methods', figsize=(1 + len(dfx.columns) * 0.8, 6),
    legend_style=dict(loc='upper center', bbox_to_anchor=(0.5, 1.25), ncol=5),
    plot_style=dict(markerfacecolor="None"))
# DEPRECATED visualisation
# ax = dfx.T.plot(style='X', cmap=plt.get_cmap('nipy_spectral', len(dfx)), figsize=(len(dfx) / 2, 4), grid=True)
# _= plt.xlim([-0.5, len(cols) - 0.5])
# _= plt.yscale('log'), plt.ylabel('Median-rTRE [%]')

# ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1.0), ncol=1)
# _= plt.xticks(range(len(cols)), [col_metric_rename(c) for c in cols], rotation=90)
ax.set_xlabel('')
ax.set_xticks(range(len(cols)))
ax.set_xticklabels(list(map(col_metric_rename, cols)), rotation=45, ha="center")
# ax.legend(, fancybox=True, shadow=True)
fig.tight_layout()
fig.savefig(os.path.join(PATH_TEMP, 'scat_teams-scores.pdf'))
../_images/notebooks_ANHIR_evaluation-board_47_0.png

Show scatter plots for the rTRE and Rank measures separately

[26]:
def _ext_cname(col):
    return [col + st for st in ('', '_' + VAL_STATUS_TRAIN, '_' + VAL_STATUS_TEST)]
def _drop_cname(col):
    return col.replace('_' + VAL_STATUS_TRAIN, ' (train)').replace('_' + VAL_STATUS_TEST, ' (eval.)')

for cols, desc, use_log in [(_ext_cname('Average-Rank-Median-rTRE') + _ext_cname('Average-Rank-Max-rTRE'), 'Avg. Ranks', False),
                            (_ext_cname('Average-Median-rTRE') + _ext_cname('Average-Max-rTRE'), 'rTRE', True)]:
    dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols]
    # colors = plt.get_cmap('nipy_spectral', len(dfx))
    fig, (ax, _) = draw_scatter_double_scale(
        dfx, colors=list_methods_colors(users_ranked), ax_decs={desc: None},
        idx_markers=list_methods_markers(users_ranked), xlabel='Methods', figsize=(1 + len(dfx.columns) * 1., 5),
        legend_style=dict(loc='upper center', bbox_to_anchor=(0.5, 1.35), ncol=5))
    ax.set_xticks(range(len(cols)))
    ax.set_xticklabels(list(map(lambda c: col_metric_rename(_drop_cname(c)), cols)), rotation=45, ha="center")
    _format_ax(ax, desc, use_log, vmax=None)
    name = ''.join(filter(lambda s: s not in '(.)', desc)).replace(' ', '-')
    fig.savefig(os.path.join(PATH_TEMP, 'scat_teams-scores-%s.pdf' % name))
../_images/notebooks_ANHIR_evaluation-board_49_0.png
../_images/notebooks_ANHIR_evaluation-board_49_1.png

Show the ranking over the complete dataset as a heatmap

[27]:
cols_ = list(filter(lambda c: 'Robust' not in c and 'land' not in c, cols_general))
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_]
dfx.columns = list(map(col_metric_rename, dfx.columns))
fig = draw_matrix_user_ranking(dfx, higher_better=False, fig=plt.figure(figsize=(4, 4)))
fig.savefig(os.path.join(PATH_TEMP, 'chess_teams-ranking_all.pdf'))
/home/jb/Dropbox/Workspace/BIRL/birl/utilities/evaluate.py:223: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  ranking = np.zeros(df_stat.as_matrix().shape)
../_images/notebooks_ANHIR_evaluation-board_51_1.png

Show the ranking for training and evaluation image pairs as a heatmap

[28]:
cols_ = list(filter(lambda c: 'Robust' not in c and 'land' not in c, cols_train_test))
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_]
dfx.columns = list(map(col_metric_rename, dfx.columns))
fig = draw_matrix_user_ranking(dfx, higher_better=False)
fig.savefig(os.path.join(PATH_TEMP, 'chess_teams-ranking_train-test.pdf'))
/home/jb/Dropbox/Workspace/BIRL/birl/utilities/drawing.py:494: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  fig, _ = plt.subplots(figsize=np.array(df_stat.as_matrix().shape[::-1]) * 0.35)
../_images/notebooks_ANHIR_evaluation-board_53_1.png

Visual statistics over tissue types

Present some statistics depending on the tissue types…

[29]:
cols_all = pd.DataFrame(user_aggreg).T.columns
# col_avg_med_tissue = sorted(filter(
#     lambda c: 'Median-rTRE_tissue' in c and not 'Rank' in c and 'Median-Median-' not in c, cols_all))
# col_robust_tissue = sorted(filter(
#     lambda c: 'Average-Robustness_tissue' in c and not 'Rank' in c, cols_all))
cols_avg_med_tissue_ = {st: sorted(filter(
    lambda c: 'Median-rTRE_%stissue' % st in c and not 'Rank' in c and 'Median-Median-' not in c, cols_all))
    for st in ('', VAL_STATUS_TRAIN + '_', VAL_STATUS_TEST + '_')}
cols_robust_tissue_ = {st: sorted(filter(
    lambda c: 'Average-Robustness_%stissue' % st in c and not 'Rank' in c, cols_all))
    for st in ('', VAL_STATUS_TRAIN + '_', VAL_STATUS_TEST + '_')}
[30]:
# [(col_avg_med_tissue, 'Avg. Median rTRE', 'Average-Median-rTRE_tissue_', True),
#  (col_robust_tissue, 'Avg. Robustness', 'Average-Robustness_tissue_', False),]
LUT_TRAIN_TEST = {
    VAL_STATUS_TRAIN + '_': '(train)',
    VAL_STATUS_TEST + '_': '(eval.)',
}

params_tuple = [(cols_avg_med_tissue_[st],
                 'Avg. Median rTRE %s' % LUT_TRAIN_TEST.get(st, ''),
                 'Average-Median-rTRE_%stissue_' % st, True)
                for st in ('', VAL_STATUS_TRAIN + '_', VAL_STATUS_TEST + '_')]
params_tuple += [(cols_robust_tissue_[st],
                  'Avg. Robust %s' % LUT_TRAIN_TEST.get(st, ''),
                  'Average-Robustness_%stissue_' % st, False)
                 for st in ('', VAL_STATUS_TRAIN + '_', VAL_STATUS_TEST + '_')]

for cols, desc, drop, use_log in params_tuple:
    print('"%s" with sample columns: %s' % (desc, cols[:3]))
    dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols]
    # colors = plt.get_cmap('nipy_spectral', len(dfx))
    fig, (ax, _) = draw_scatter_double_scale(
        dfx, colors=list_methods_colors(users_ranked), ax_decs={desc: None},
        idx_markers=list_methods_markers(users_ranked), xlabel='Methods', figsize=(1 + len(dfx.columns) * 0.8, 6),
        legend_style=dict(loc='upper center', bbox_to_anchor=(0.5, 1.35), ncol=5))
    # DEPRECATED visualisation
    # ax = dfx.T.plot(style='X', cmap=plt.get_cmap('nipy_spectral', len(dfx)), figsize=(len(dfx) / 2 + 1, 5), grid=True)
    # ax.legend(loc='upper center', bbox_to_anchor=(1.2, 1.0), ncol=1)
    ax.set_xticks(range(len(cols)))
    ax.set_xticklabels(list(map(lambda c: col_metric_rename(col_tissue_rename(c.replace(drop, ''))), cols)),
                       rotation=45, ha="center")
    _format_ax(ax, desc, use_log, vmax=None)
    name = ''.join(filter(lambda s: s not in '(.)', desc)).replace(' ', '-')
    fig.savefig(os.path.join(PATH_TEMP, 'scat_teams-scores_tissue-%s.pdf' % name))
"Avg. Median rTRE " with sample columns: ['Average-Median-rTRE_tissue_COAD', 'Average-Median-rTRE_tissue_gastric', 'Average-Median-rTRE_tissue_human-breast']
"Avg. Median rTRE (train)" with sample columns: ['Average-Median-rTRE_training_tissue_COAD', 'Average-Median-rTRE_training_tissue_gastric', 'Average-Median-rTRE_training_tissue_human-breast']
"Avg. Median rTRE (eval.)" with sample columns: ['Average-Median-rTRE_evaluation_tissue_COAD', 'Average-Median-rTRE_evaluation_tissue_gastric', 'Average-Median-rTRE_evaluation_tissue_human-breast']
"Avg. Robust " with sample columns: ['Average-Robustness_tissue_COAD', 'Average-Robustness_tissue_gastric', 'Average-Robustness_tissue_human-breast']
"Avg. Robust (train)" with sample columns: ['Average-Robustness_training_tissue_COAD', 'Average-Robustness_training_tissue_gastric', 'Average-Robustness_training_tissue_human-breast']
"Avg. Robust (eval.)" with sample columns: ['Average-Robustness_evaluation_tissue_COAD', 'Average-Robustness_evaluation_tissue_gastric', 'Average-Robustness_evaluation_tissue_human-breast']
../_images/notebooks_ANHIR_evaluation-board_56_1.png
../_images/notebooks_ANHIR_evaluation-board_56_2.png
../_images/notebooks_ANHIR_evaluation-board_56_3.png
../_images/notebooks_ANHIR_evaluation-board_56_4.png
../_images/notebooks_ANHIR_evaluation-board_56_5.png
../_images/notebooks_ANHIR_evaluation-board_56_6.png
[31]:
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_avg_med_tissue_['']]
dfx.columns = list(map(lambda c: col_metric_rename(c).replace('_tissue', '').replace('Average', 'Avg'),
                       dfx.columns))
fig = draw_matrix_user_ranking(dfx, higher_better=False, fig=plt.figure(figsize=(5, 5)))
fig.savefig(os.path.join(PATH_TEMP, 'chess_teams-ranking_tissue-avg.pdf'))
../_images/notebooks_ANHIR_evaluation-board_57_0.png
[32]:
dfx = pd.DataFrame(user_aggreg)[users_ranked].T[cols_robust_tissue_['']]
dfx.columns = list(map(lambda c: col_tissue_rename(c).replace('Average-Robustness_tissue', 'Avg-Robust'),
                       dfx.columns))
fig = draw_matrix_user_ranking(dfx, higher_better=True, fig=plt.figure(figsize=(5, 5)))
fig.savefig(os.path.join(PATH_TEMP, 'chess_teams-ranking_tissue-robust.pdf'))
../_images/notebooks_ANHIR_evaluation-board_58_0.png

Calculating pairwise significance

[33]:
cases = [k for k in submissions[users[0]]['cases']
         if submissions[users[0]]['cases'][k][FIELD_STATUS] == 'evaluation']
print ('# cases: ', len(cases))
# cases:  251
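
The comparison below uses a one-sided Wilcoxon signed-rank test on the paired per-case rTREs; a tiny self-contained example (synthetic numbers) of what `alternative='greater'` asks:

from scipy.stats import wilcoxon

a = [0.012, 0.030, 0.021, 0.015, 0.027]  # synthetic per-case rTREs of a weaker method
b = [0.005, 0.010, 0.008, 0.007, 0.009]  # synthetic per-case rTREs of a stronger method
s, p = wilcoxon(a, b, alternative='greater')  # H1: errors of `a` are systematically larger
print(p)  # small p -> `a` performs significantly worse than `b`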
[34]:
import matplotlib.colors
from scipy.stats import wilcoxon

# users_ = users_ranked
# JK requested having this order...
users_ = ['MEVIS', 'CKVST', 'AGH', 'UPENN', 'TUB', 'TUNI', 'DROP', 'RVSS', 'ANTs', 'UA', 'Elastix', 'bUnwarpJ', 'NiftyReg']
r = {m: list(map(lambda c: submissions[m]['cases'][c]['rTRE-Median'], cases)) for m in users}

ps = np.zeros((len(users), len(users)))
for i, j in ((i, j) for i in range(len(users)) for j in range(len(users))):
    if i != j:
        s, p = wilcoxon(r[users_[i]], r[users_[j]], alternative='greater')
        if i > j and p > 0.01:
            print('Difference is not significant between %s and %s for p=%f' % (users_[i], users_[j], p))
        ps[i, j] = p
    else:
        ps[i, j] = 0.5

cvals = [0., 0.01, 0.5, 0.99, 1.0]
colors = [[0, 0, 1], [0.25, 0.25, 0.75], [0.5, 0.5, 0.5], [0.75, 0.25, 0.25], [1., 0, 0]]
cvals_norm = plt.Normalize(min(cvals), max(cvals))
cvals_color = list(zip(map(cvals_norm, cvals), colors))
cmap = matplotlib.colors.LinearSegmentedColormap.from_list("", cvals_color)
plt.imshow(ps, interpolation='nearest', cmap=cmap, norm=cvals_norm)
plt.colorbar()
_= plt.xticks(range(len(users)), users_, rotation=90)
_= plt.yticks(range(len(users)), users_)
plt.gca().get_figure().savefig(os.path.join(PATH_TEMP, 'wilcoxon_teams_rTRE.pdf'), bbox_inches='tight')
Difference is not significant between CKVST and MEVIS for p=0.346996
Difference is not significant between AGH and MEVIS for p=0.013812
Difference is not significant between AGH and CKVST for p=0.415407
Difference is not significant between UPENN and CKVST for p=0.444396
Difference is not significant between UPENN and AGH for p=0.080876
Difference is not significant between ANTs and RVSS for p=0.325578
Difference is not significant between UA and ANTs for p=0.422872
Difference is not significant between Elastix and ANTs for p=0.078941
Difference is not significant between Elastix and UA for p=0.430366
../_images/notebooks_ANHIR_evaluation-board_61_1.png