In [1]:
from IPython.display import HTML
from IPython.display import Markdown
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import pylab
import numpy as np
from IPython.display import Image
import sys
import seaborn as sns
from matplotlib.colors import LogNorm
from cycler import cycler 
import itertools
In [2]:
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
code_show=true; 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
<style>.container { width:95% !important; }</style>
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')
Out[2]:
The raw code for this IPython notebook is by default hidden for easier reading. To toggle on/off the raw code, click here.
In [3]:
# Set the date to previous business day
myd = (pd.Timestamp.today() - pd.tseries.offsets.BDay(1))
myDate = myd.strftime('%Y%m%d')
#override day here
#myDate ='20180914'
In [4]:
plotColors=[(0,0,153/255),(75/255,155/255,1),(210/255,210/255,210/255),
            (180/255,225/255,1),(110/255,110/255,110/255),(5/255,99/255,193/255),
            (110/255,110/255,110/255),(110/255,215/255,0),(215/255,235/255,1),(165/255,230/255,0)]
marketColors = { 'bg': { 'XEUR' : '#727272', 'XETR' : '#181813' }, 
                'text': { 'XEUR' : '#333333', 'XETR' : '#0c519e' }, 
                'marker': { 'XEUR' : plotColors[0], 'XETR' : plotColors[1] } }
In [5]:
# Define some global values
# closup low
t3a_t3n_lowmin = 0
t3a_t3n_lowmax = 2000

mint3a = 2500
maxt3a = 4000
mint3n = mint3a
maxt3n = 4000
binWidth = 2
binrange = np.arange(mint3a,maxt3a,binWidth)
windowsize = 25
# dictionaries we need later
partCondition=dict()
ts=dict()
t9_t3n=dict()
t9_t9d=dict()
t3a_t3n=dict()
t9d_t3a=dict()
t9d_t3n=dict()
net=dict()
t3a_t3n_estimate=dict()
t9d_t3a_estimate=dict()
t9_t9d_median=dict()
t9d_t3a_median=dict()
t3a_t3n_residual=dict()
t3nCorr=dict()
t9dCorr=dict()
t9d_t3n_corr= dict()
emptySeries = dict()
partitions = dict()
allpartitions = list()
topproducts = dict()
marketNames = { 'XEUR': 'Eurex', 'XETR' : 'Xetra'}
In [6]:
# a couple of useful functions
# convert missing values to zero
def timeConverter(num):
    try:
        return np.int64(num)
    except:
        return np.int64(0)
    
# percentile calculation
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_
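
# Minimal usage sketch for percentile() (illustration only): aggregate
# synthetic latencies per partition; the real columns appear further below.
_demo = pd.DataFrame({'PartitionID': [1, 1, 2, 2],
                      't3a_t3n':     [700, 900, 650, 1100]})
print(_demo.groupby('PartitionID')['t3a_t3n'].agg([percentile(25), percentile(75)]))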
In [7]:
df = pd.DataFrame()
dfall = pd.DataFrame()
def ReadFile(fileName):
    return pd.read_csv(fileName,sep=";", 
                         converters = {'ETICaptTime'  : timeConverter, 
                                       'EOBICaptTime' : timeConverter, 
                                       'TransactTime' : timeConverter,
                                       'AggressorTime': timeConverter,
                                       'RequestTime'  : timeConverter },
                      usecols=['MarketSegmentID', 'ExecID',
                               'PartitionID', 'AggressorSide', 'LastQty', 'LastPx',
                               'RequestTime',   # t3n
                               'AggressorTime', # t5
                               'TransactTime',  # t9
                               'EOBICaptTime',  # t9d
                               'ETICaptTime'    # t3a
                              ],
                    dtype = {'MarketSegmentID':np.int32,  
                             'ExecID':np.int64,
                             'PartitionID':np.int8,
                             'AggressorSide':np.int8,
                             'LastQty':np.int64},
                     ).sort_values(by=['ExecID'])
In [8]:
def CalculateTriggers(df):
    # search for the latest EOBI packet that precedes a given aggressive order by <minDelay> ns
    # assume this packet was the trigger for the aggressing order
    # minimum delay is cable length plus t9d to t9a, 2 x 181.5 m * 5 ns/m + 915 ns [delay t_9d => t_9a]
    minDelay=2730 
    t_3a = df['ETICaptTime'].values
    t_9d = df['EOBICaptTime'].values
    t_9  = df['TransactTime'].values
    t_3  = df['RequestTime'].values
    msID = df['MarketSegmentID'].values
    # t_9d of triggering EOBI packet
    df['trigger_t9d'] = t_9d[t_9d.searchsorted(t_3a-minDelay)-1]
    # t_9 of triggering EOBI packet
    df['trigger_t9'] = t_9[t_9d.searchsorted(t_3-minDelay)-1]
    # t_3n of the aggressing order behind the triggering EOBI packet
    df['trigger_t3'] = t_3[t_9d.searchsorted(t_3a-minDelay)-1]
    #triggering market segment ID
    df['triggermsID'] = msID[t_9d.searchsorted(t_3a-minDelay)-1]
    # time in datetime format for later plotting
    df['dt']=pd.to_datetime(df['RequestTime'])
    # calculate the delay trigger to capture time (t_9d to t_3a)
    df['t9d_t3a'] =  (df['ETICaptTime'] - df['trigger_t9d']) * (df['ETICaptTime'] > 0) 
    # calculate the delay trigger to gateway time (t_9d to t_3n)
    df['t9d_t3n'] =  (df['RequestTime'] - df['trigger_t9d']) * (df['RequestTime'] > 0)
    # calculate the delay trigger to gateway time (t_9 to t_3n)
    df['t9_t3n'] =  (df['RequestTime'] - df['trigger_t9']) * (df['RequestTime'] > 0)
    # calculate the app to wire delay of triggering EOBI packet (t_9 to t_9d)
    df['t9_t9d'] =  (df['trigger_t9d'] - df['trigger_t9']) * (df['RequestTime'] > 0)
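
# Toy illustration of the searchsorted() lookup above (not part of the
# analysis): for each aggressor capture time t_3a we pick the latest EOBI
# capture time t_9d that is at least minDelay ns older.
_t9d_demo = np.array([1000, 2000, 3000])             # sorted EOBI capture times (ns)
_t3a_demo = np.array([4800, 5900])                   # ETI capture times of aggressors (ns)
_idx = _t9d_demo.searchsorted(_t3a_demo - 2730) - 1  # index of the assumed trigger
print(_t9d_demo[_idx])                               # -> [2000 3000]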
In [9]:
def CalculateLatency(df):
    t_3    = df['RequestTime'].values
    meIn   = df['AggressorTime'].values
    execID = df['ExecID'].values
    mdOut  = df['TransactTime'].values
    df['t3n_t5'] = meIn-t_3
    df['t5_t7'] = execID - meIn
    df['t7_t9'] = mdOut - execID
    # EOBI latency app to wire (t9 to t9d)
    df['t9_t9d'] = df['EOBICaptTime'] - df['TransactTime']
    # account for offset between T7 server times and network capture times
    timeOffset = 0 
    # latency of ETI request through network (t3a to t3n)
    df['t3a_t3n'] = df['RequestTime'] - df['ETICaptTime'] - timeOffset
    # latency of ETI request from network capture to matching engine (t3a to t7)
    df['t3a_t7'] = df['ExecID'] - df['ETICaptTime'] - timeOffset
    
    
    fastCondition = (df['t3a_t3n'] < t3a_t3n_lowmax) & (df['t3a_t3n'] > t3a_t3n_lowmin)
    allpartitions = list()
    for market in ['XEUR','XETR']:        
        partitions[market] = list(dfall[dfall['market']==market].dropna().groupby('PartitionID').size().index)
        allpartitions = allpartitions + partitions[market]
        t9_t9d_median[market] = df[df['market'] == market].dropna()['t9_t9d'].median()
        t3nCorr[market] = df[(df['market'] == market) & fastCondition].dropna()['t3a_t3n'].median()
        t9dCorr[market] = df[df['market'] == market].dropna()['t9_t9d'].median()

        for partition in partitions[market]:
            emptySeries[partition] = pd.Series()
            partCondition[partition]=(df['PartitionID'] == partition) & fastCondition 
            condition=partCondition[partition]
            ts[partition]      = df['RequestTime'][condition].dropna()
            t9_t3n[partition]  = df['t9_t3n'][condition].dropna()
            t9_t9d[partition]  = df['t9_t9d'][condition].dropna()
            t3a_t3n[partition] = df['t3a_t3n'][condition].dropna()
            t9d_t3a[partition] = df['t9d_t3a'][condition].dropna()
            t9d_t3n[partition] = df['t9d_t3n'][condition].dropna()
            net[partition]     = df['t3a_t3n'][condition].dropna()
            t3a_t3n_estimate[partition] = df['t3a_t3n'][condition].dropna().ffill().rolling(window=windowsize, center=True).median().ffill().bfill()
            t9d_t3a_estimate[partition] = t9d_t3n[partition] - t3a_t3n_estimate[partition]
            t9d_t3a_median[partition] = t9_t3n[partition]  - t9dCorr[market] - t3nCorr[market] 
            t3a_t3n_residual[partition] = net[partition] - t3a_t3n_estimate[partition]
            t9d_t3n_corr[partition] = t9d_t3n[partition] - t3nCorr[market] 

    for partition in allpartitions:
        a = dfall[dfall['PartitionID'] == partition].dropna().groupby(['sprod']).size() # ['LastQty'].sum()
        topproducts[partition] = list(a.sort_values(ascending=False).head(5).index)
In [10]:
def TranslateProds(df):
    prodFile = '/local/neusseb/workspace/mml-tools/hpt/products.csv'
    products = pd.read_csv(prodFile, sep=";")
    cgFile = '/local/neusseb/workspace/mml-tools/hpt/capgroups.csv'
    capgroups = pd.read_csv(cgFile, sep=";")
    partFile = '/local/neusseb/workspace/mml-tools/hpt/partitions.csv'
    ps = pd.read_csv(partFile, sep=";")
    mFile = '/local/neusseb/workspace/mml-tools/hpt/markets.csv'
    ms = pd.read_csv(mFile, sep=";")
    prodMap = dict(zip(products.ID,products.Name))
    prodToCap = dict(zip(products.ID,products.ParentID))
    capToPart = dict(zip(capgroups.ID,capgroups.ParentID))
    part2Market=dict(zip(ps.ID,ps.ParentID))
    market2Name =dict(zip(ms.ID,ms.Name))
    df['market'] = df.MarketSegmentID.map(prodToCap).map(capToPart).map(part2Market).map(market2Name)
    df['prod'] = df.MarketSegmentID.map(prodToCap).map(capToPart).map(part2Market).map(market2Name) + '-' + df.MarketSegmentID.map(prodMap)
    df['sprod'] =  df.MarketSegmentID.map(prodMap)
    df['triggerProd'] = df.triggermsID.map(prodToCap).map(capToPart).map(part2Market).map(market2Name) + '-' + df.triggermsID.map(prodMap)
In [11]:
def DrawPartitions(market, type, minTime, maxTime, lowlimit, highlimit,plotEstimate, showMedian = True):
    # subplots
    cols = 2
    rows = 5
    pylab.rcParams['figure.figsize'] = (20, 12)
    f, axarr = plt.subplots(rows, cols, sharex='col',sharey='row')
    miny = np.round(df[type][(df[type]<highlimit) & (df['market'] == market) ].quantile(0.01),-2)-100
    maxy = np.round(df[type][(df[type]<highlimit) & (df['market'] == market)].quantile(0.99),-2)+100
    timeCondition = (df['RequestTime'] >= minTime) & (df['RequestTime'] <= maxTime) 
    for partition in partitions[market]:
        condition = (df['PartitionID'] == partition) & timeCondition        
        #cr = plotColors[partition- min(partitions[market])]
        cr = marketColors['marker'][market]
        sp = axarr[(partition - min(partitions[market]))// cols,(partition + min(partitions[market]))% cols ]
        sp.grid(color='grey', linestyle='dotted', linewidth=1, axis='y')
        sp.set_facecolor('xkcd:white')
        sp.set_ylim([miny,maxy])
        ts =   df['dt'].where(condition).dropna()
        net =  df[type].where(condition).dropna()
        if (showMedian):
            sp.set_title('Partition %s (median latency %d ns)'%(partition,net.median()), fontsize=18,color=marketColors['text'][market])  
        else:
            sp.set_title('Partition %s'%(partition), fontsize=18,color=marketColors['text'][market])  
        markerSize = 3   
        if (plotEstimate):
            windowsize=15
            markerSize = 3
            sp.plot_date(ts, net,color=cr, tz=cet, marker='.', markersize=markerSize)
            estimate = df[type][condition].dropna().where((df[type] < maxy) & (df[type] > miny) ).ffill().rolling(window=windowsize, center=True).median().bfill().ffill()
            sp.plot_date(ts, estimate,color='black', tz=cet,lw=1,ls='-',markersize=0)
        else:
            sp.plot_date(ts, net,color=cr, tz=cet, marker='.', markersize=markerSize)
    dummy = plt.suptitle('%s\n%s for executions'%(marketNames[market], type ), fontsize=26, color=marketColors['text'][market])
    plt.subplots_adjust( wspace=0.2, hspace=0.5)
    plt.show()
In [12]:
def DrawPartitionSeries(market, series, minTime, maxTime, lowlimit, highlimit):
    # subplots
    cols = 2
    rows = 5
    pylab.rcParams['figure.figsize'] = (20, 12)
    f, axarr = plt.subplots(rows, cols, sharex='col',sharey='row')
    for partition in partitions[market]:
        timeCondition = (ts[partition] >= minTime) & (ts[partition] <= maxTime) 
        condition = timeCondition        
        #cr = plotColors[partition- min(partitions[market])]
        cr = marketColors['marker'][market]
        sp = axarr[(partition - min(partitions[market]))// cols,(partition + min(partitions[market]))% cols ]
        sp.grid(color='grey', linestyle='dotted', linewidth=1, axis='y')
        sp.set_facecolor('xkcd:white')
        sp.set_ylim([lowlimit,highlimit])
        myTs =   pd.to_datetime(ts[partition].where(condition).dropna())
        mySeries = series[partition][condition].dropna()
        #sp.set_title('Partition %s (median latency %d ns)'%(partition,mySeries.median()), fontsize=18,color=marketColors['text'][market])   
        markerSize = 3   
        sp.plot_date(myTs, mySeries,color=cr, tz=cet, marker='.', markersize=markerSize)
    dummy = plt.suptitle('%s\nlatency series for executions'%(marketNames[market]), fontsize=26, color=marketColors['text'][market])
    plt.subplots_adjust( wspace=0.2, hspace=0.5)
    plt.show()
In [13]:
def DrawPartitionHist(series, series2, binRange, market, title, printQuartiles):
    # subplots
    cols = 2
    rows = 5
    pylab.rcParams['figure.figsize'] = (20, 12)
    f, axarr = plt.subplots(rows, cols, sharex='col',sharey='row')
    for partition in partitions[market]:
        sp = axarr[((partition - min(partitions[market]))// cols),(partition + min(partitions[market]))% cols ]
        #cr = plotColors[partition- min(partitions[market])]
        cr = marketColors['marker'][market]
        sp.grid(color='grey', linestyle='dotted', linewidth=1, axis='x')
        sp.set_facecolor('xkcd:white')
        if (printQuartiles):
            sp.set_title('Partition %s (quartile %d, %d, %d)'%(partition, series[partition].quantile(0.25), series[partition].quantile(0.5), series[partition].quantile(0.75)), fontsize=18) 
        else:
            sp.set_title('Partition %s '%(partition), fontsize=18) 
        
        sp.hist( series[partition],bins=binRange,color=cr, alpha=1)
        if (series2[partition].size > 0):
            sp.hist( series2[partition],bins=binRange,color='grey', alpha=0.5)
    
    dummy = plt.suptitle(title, fontsize=26, color=marketColors['text'][market])
    plt.subplots_adjust( wspace=0.2, hspace=0.5)
    plt.show()
In [14]:
def DrawPartitionHistSingle(givenPartition, series, series2, binMin,binMax,bucket, market, title,seriesName,seriesName2, printQuartiles):
    # subplots
    cols = 1
    rows = 1
    yt = [0,25,50,75]
    ytick =[int(i/bucket) for i in yt]
    binRange = np.arange(binMin,binMax,bucket)    
    pylab.rcParams['figure.figsize'] = (30,15)
    f, axarr = plt.subplots(rows, cols, sharex='col',sharey='row')
    for partition in [givenPartition]:
        sp = axarr
        #cr = plotColors[partition- min(partitions[market])]
        cr = marketColors['marker'][market]
        sp.grid(color='grey', linestyle='dotted', linewidth=1, axis='x')
        sp.set_facecolor('xkcd:white')

        
        sp.hist( series[partition],bins=binRange,color='blue', alpha=1)
        if (series2[partition].size > 0):
            sp.hist( series2[partition],bins=binRange,color='grey', alpha=0.5)
    sp.set_xticks(range(binMin,binMax, 200))
    sp.set_xticklabels(range(binMin,binMax, 200),size=30, weight='bold')
    sp.set_yticks(yt)
    sp.set_yticklabels(ytick,size=20, weight='bold')
    #sp.set_yticklabels([0,25,50,75],size=20, weight='bold')

    plt.savefig('%s_%s_%s.png'%(givenPartition, seriesName, seriesName2),dpi=None, facecolor='w', edgecolor='w',
        orientation='portrait', papertype=None, format=None,
        transparent=False, bbox_inches=None, pad_inches=0.1,
        frameon=None)
    plt.show()
In [15]:
def DrawScatter(series1, series2, xlabel, ylabel, tmin, tmax, title):
    pylab.rcParams['figure.figsize'] = (30, 30)
    scatter_axes = plt.subplot2grid((5, 5), (1, 0), rowspan=4, colspan=4)
    x_hist_axes  = plt.subplot2grid((5, 5), (0, 0), colspan=4, sharex=scatter_axes)
    y_hist_axes  = plt.subplot2grid((5, 5), (1, 4), rowspan=4,sharey=scatter_axes)
    
    plt.suptitle(title, fontsize=26)
    scatter_axes.set_xlim([tmin,tmax])
    scatter_axes.set_ylim([tmin,tmax])
    
    y_hist_axes.set_ylim([tmin,tmax])
    x_hist_axes.set_xlim([tmin,tmax])
    binWidth = 1
    binrangex = np.arange(tmin,tmax,binWidth)
    binrangey = binrangex
    scatter_axes.set_xlabel(xlabel)
    scatter_axes.set_ylabel(ylabel)
    maxHist = np.round(series1.value_counts().max()*binWidth,-2)+50
    
    x_hist_axes.set_ylim([0,maxHist])
    y_hist_axes.set_xlim([0,maxHist])
    plt.setp(x_hist_axes.get_xticklabels(), visible=False)
    plt.setp(y_hist_axes.get_yticklabels(), visible=False)

    scatter_axes.scatter(series1,series2, marker='.',s=2)
    scatter_axes.grid(color='grey', linestyle='dotted', linewidth=1)
    
    
    hist, edges = np.histogram(series1,  bins=binrangex)
    maxima = list(map(lambda x: (edges[x],hist[x]),argrelmax(hist, order=15)))
    #print(list(zip(maxima[0][0],maxima[0][1])))
    #x_hist_axes.bar(edges[:-1], hist, width=0.5,linewidth = 0)
    for a in list(zip(maxima[0][0],maxima[0][1])):
        if (a[1] > 1E-04):
            x_hist_axes.annotate('%s'%a[0], xy=a,xycoords='data', textcoords='offset points',xytext=(-15,15))
    
    x_hist_axes.hist(series1, bins=binrangex, alpha=0.75)
    x_hist_axes.grid(color='grey', linestyle='dotted', linewidth=1, axis='x')
    
    
    
    hist, edges = np.histogram(series2,  bins=binrangey)
    maxima = list(map(lambda x: (edges[x],hist[x]),argrelmax(hist, order=15)))
    #print(list(zip(maxima[0][0],maxima[0][1])))
    #x_hist_axes.bar(edges[:-1], hist, width=0.5,linewidth = 0)
    for a in list(zip(maxima[0][1],maxima[0][0])):
        if (a[0] > 1E-04):
            y_hist_axes.annotate('%s'%a[1], xy=a,xycoords='data', textcoords='offset points',xytext=(5,0))
    
    
    y_hist_axes.hist(series2, orientation='horizontal', bins=binrangey, alpha=0.75)
    
    y_hist_axes.grid(color='grey', linestyle='dotted', linewidth=1, axis='y')
    
    scatter_axes.set_xlabel(xlabel, fontsize=16)
    scatter_axes.set_ylabel(ylabel, fontsize=16)
    x_hist_axes.set_ylabel('#observations', fontsize=16)
    y_hist_axes.set_xlabel('#observations', fontsize=16)
    plt.subplots_adjust( wspace=0.1, hspace=0.1)
    plt.show()
In [16]:
Markdown('''# Analysis of HPT file (Date %s)
The following notebook is based on the High Precision Timestamp file of **%s**, also see the [Deutsche Börse member section](https://member.deutsche-boerse.com/).
It demonstrates what kind of information can be extracted from the file.

It does NOT contain any information other than that contained in the HPT file.
If you want to map your own dataset, we suggest taking the MarketSegmentID + ExecID of your executions (or those that you missed) and mapping them to the corresponding records in the HPT file.

Note that not all of the executions have a corresponding network capture timestamp, as we only capture order entry traffic originating from the 10 GBit CoLo 2.0 network.

The time synchronization of the capturing devices is implemented using a White Rabbit network in combination with PPS breakouts. This leads to very stable, almost drift-free network-level timestamps with an error below +/- 5 ns.

The timestamps taken by T7 applications and the gateway network card are synchronized via a PTP network, which leads to larger time drifts on the order of up to +/- 100 ns (see details below).

The HPT file contains all timestamps from the execution summary, i.e. 
* RequestTime (t_3n), 
* AggressorTime (t_5),
* ExecID        (t_7) and
* TransactTime  (t_9)
and additionally 
* ETICaptTime (t_3a) of the aggressing order (if available) and
* EOBICaptTime (t_9d)

The following figure shows where the timestamps are taken:'''%(myd.strftime('%Y-%m-%d'),myd.strftime('%A %d %B %Y')))
Out[16]:

Analysis of HPT file (Date 2018-12-10)

The following notebook is based on the High Precision Timestamp file of Monday 10 December 2018, also see the Deutsche Börse member section. It demonstrates what kind of information can be extracted from the file.

It does NOT contain any information other than that contained in the HPT file. If you want to map your own dataset, we suggest taking the MarketSegmentID + ExecID of your executions (or those that you missed) and mapping them to the corresponding records in the HPT file.

Note that not all of the executions have a corresponding network capture timestamp, as we only capture order entry traffic originating from the 10 GBit CoLo 2.0 network.

The time synchronization of the capturing devices is implemented using a White Rabbit network in combination with PPS breakouts. This leads to very stable, almost drift-free network-level timestamps with an error below +/- 5 ns.

The timestamps taken by T7 applications and the gateway network card are synchronized via a PTP network, which leads to larger time drifts on the order of up to +/- 100 ns (see details below).

The HPT file contains all timestamps from the execution summary, i.e.

  • RequestTime (t_3n),
  • AggressorTime (t_5),
  • ExecID (t_7) and
  • TransactTime (t_9),
  and additionally
  • ETICaptTime (t_3a) of the aggressing order (if available) and
  • EOBICaptTime (t_9d)

The following figure shows where the timestamps are taken:

In [17]:
# @hidden_cell
Image(filename="t7_network.png")
Out[17]:

Median network delays

We have measured the following median network delays

  • trading participant's patch panel to middle of access layer tap at $t_{3a}$ or $t_{9a}$: 911 ns
  • mid point tap at $t_{3a}$ to midpoint tap at $t_{3d}$: 936 ns
  • $t_{9d}$ to $t_{9a}$: 914 ns

    (1) $t_{9a} - t_{9d} = 914 ns = $ midpoint tap($t_{9d}$ to $t_{9a}$) $ + A_m - D_m $

  • minimum time between trigger market data $t_{9d}$ and $t_{3a}$ then becomes

    (2) $(t_{3a} - t_{9d})_{min} = 2\cdot911 ns + $ midpoint tap($t_{9d}$ to $t_{9a}$) $ - D_m + A $

    and with (1)

    (3) $(t_{3a} - t_{9d})_{min} = 2\cdot911 ns + 914 ns - A_m + A = 2736 ns + (A-A_m)$

    where $A$, $A_m$ and $D_m$ are the mid-tap to capture time delays at $t_{3a}$, $t_{9a}$ and $t_{9d}$ respectively.

    Cable lengths between taps and Metamako timestamping devices are 2 m at $t_{9a}$, 3 m at $t_{9d}$, $t_{9a'}$, $t_{3d}$ and $t_{4d}$, and 5 m at $t_{3a}$, $t_{3a'}$, $t_{4a}$ and $t_{4a'}$, so

    (4) $A-A_m = 15 ns$

    and

    (5) $(t_{3a} - t_{9d})_{min} = 2751 ns$
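
As a quick sanity check of equations (1) to (5) (a sketch; the constants are the medians quoted above):

# numeric check of the minimum trigger-to-order delay, equation (5)
patch_to_tap = 911  # participant patch panel to access layer tap (ns)
tap_to_tap   = 914  # median t_9d to t_9a (ns)
a_minus_am   = 15   # capture cable-length difference, equation (4) (ns)
print(2 * patch_to_tap + tap_to_tap + a_minus_am)  # -> 2751 ns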

In [18]:
# read in hpt file
fileName = 'hpt_%s_complete.csv'%(myDate)
dfall = ReadFile(fileName)
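
# Sketch of the mapping suggested in the introduction: join your own
# execution records to the HPT data on MarketSegmentID + ExecID. Here we
# fake a participant file by sampling; replace it with your own fills.
myFills = dfall[['MarketSegmentID', 'ExecID']].sample(5, random_state=0)
merged = myFills.merge(dfall, on=['MarketSegmentID', 'ExecID'], how='left')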
In [19]:
CalculateTriggers(dfall)
TranslateProds(dfall)
CalculateLatency(dfall)
In [20]:
# filter out rows that don't have a ETICaptTime or RequestTime set
df=dfall[(dfall.RequestTime != 0) & (dfall.ETICaptTime != 0)]
In [21]:
# min and max of the day
minTime = min(df['RequestTime'])
maxTime = max(df['RequestTime'])
import pytz
cet=pytz.timezone('Europe/Berlin')
In [22]:
pylab.rcParams['figure.figsize'] = (20, 10)
tpl = dict()
hpl = dict()
binr = dict()
yr = dict()

# set y axis ranges
yr[0] = [df['t3a_t3n'].min(),df['t3a_t3n'].max()]
maxAbs = 150000
if yr[0][1] > maxAbs:
    yr[0][1] = maxAbs
if  yr[0][0] < -1 * maxAbs:
    yr[0][0] = -1 * maxAbs
yr[1] = [40000,yr[0][1] if yr[0][1] <= 80000 else 80000] 
yr[2] = [500,2500]


for i in [0,1,2]:
    if i == 0:
        tpl[i] = plt.subplot2grid((3, 5), (i, 0), rowspan=1, colspan=4)
    else:
        tpl[i] = plt.subplot2grid((3, 5), (i, 0), rowspan=1, colspan=4, sharex=tpl[0])
    hpl[i] = plt.subplot2grid((3, 5), (i, 4), colspan=1, sharey=tpl[i])
    hpl[i].set_xscale("log")
    tpl[i].set_ylim(yr[i][0],yr[i][1])
    binr[i] = np.arange(yr[i][0],yr[i][1],(yr[i][1]-yr[i][0])/200)



tpl[0].plot_date(df['dt'], df['t3a_t3n'], tz=cet, marker='.', markersize=1, mec='black')
hpl[0].hist(df['t3a_t3n'], orientation='horizontal', bins=binr[0], alpha=0.75, color='black')


for market in ['XETR','XEUR']:
    series=df['t3a_t3n'].where(df['market'] == market).dropna()
    times = df['dt'].where(df['market'] == market).dropna()
    for i in [1,2]:
        tpl[i].plot_date(times, series, tz=cet, label=marketNames[market], marker='.', ms=1, mec=marketColors['marker'][market],alpha=0.75)
        hpl[i].hist(series, orientation='horizontal', bins=binr[i], alpha=0.75, label=marketNames[market], color=marketColors['marker'][market])
#    tpl[2].plot_date(times, series, tz=cet, label=marketNames[market], marker='.', ms=1, mec=marketColors['marker'][market],alpha=0.75)
#    hpl[2].hist(series, orientation='horizontal', bins=binr[2], alpha=0.75, label=marketNames[market], color=marketColors['marker'][market])
for i in [1,2]:
    hpl[i].legend(markerscale=10)
    tpl[i].legend(markerscale=10)
    
for i in [0,1,2]:
    plt.setp(hpl[i].get_yticklabels(), visible=False)

plt.setp(tpl[0].get_xticklabels(), visible=False)
d= plt.setp(tpl[1].get_xticklabels(), visible=False)

Network transport times (Order Entry)

The plot below shows the network transport time t_3a to t_3n over the course of the day (all times are UTC). We can distinguish two latency domains: one above 40 µs and one below 4 µs.

The upper band consists of requests that are not optimized for latency. These are either routed to low frequency gateways or are routed to high frequency gateways on the other side (e.g. a request sent via a cross connect on side A targeting a gateway on side B).

There are also a few requests that have a negative network latency. This is an artefact of the way the Solarflare wire-order delivery API delivers timestamps: if more than one request arrives on the same session within a very short time frame, the packets may be handed over to the application in a single callback. This callback carries the timestamp of the first message and may thus lead to an apparently negative network latency.

In [23]:
plt.show()
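
The negative-latency artefact mentioned above can be quantified directly (a minimal sketch on the filtered frame df):

# count requests whose measured network latency is negative
neg = (df['t3a_t3n'] < 0).sum()
print('%d of %d requests (%.2f%%) have negative t3a_t3n' % (neg, len(df), 100.0 * neg / len(df)))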

Time synch and network dynamics

In the following we concentrate on the requests sent via an optimal network connection, which usually have a latency below 4 µs.

The network latency of these requests is influenced by three factors

  • time synchronisation between the time domains of the gateway (t_3n) and the network capture device (t_3a)
  • jitter
  • queuing effects

In order to show the time synch quality of each gateway individually the below plots are split by partition.

In [24]:
DrawPartitions('XEUR','t3a_t3n',minTime, maxTime, 0, t3a_t3n_lowmax,False)
In [25]:
DrawPartitions('XETR','t3a_t3n',minTime, maxTime, 0, t3a_t3n_lowmax,False)

Market Data

The measured latency of market data between the market data publisher (t_9) and receiver (t_11) can be broken down as follows

  • application to wire latency of the udp packet (t_9d - t_9)
  • physical transport times (cable)
  • jitter (switches)
  • queuing effects (rare collisions between different market data packets)

The application to wire latency also crosses time domains, between the market data disseminator (software timestamp) and the network capture devices (synchronized via White Rabbit and PPS). The following diagrams show the latency between the SendingTime (t_9) and the distribution layer capture timestamp (t_9d).

In [26]:
DrawPartitions('XEUR','t9_t9d',minTime, maxTime, 0,15000, False)
In [27]:
DrawPartitionHist(t9_t9d, emptySeries, np.arange(0,4000,10),'XEUR','Market Data Application to Wire latency', True)
In [28]:
from scipy.signal import argrelextrema, argrelmax
histmin=0
histmax = 4000
binwidth  = 10
for partition in [1]:
    hist, edges = np.histogram(t9_t9d[partition], bins=np.arange(histmin,histmax,binwidth), density=True)
    maxima = list(map(lambda x: (edges[x],hist[x]),argrelmax(hist, order=10)))
    #print(list(zip(maxima[0][0],maxima[0][1])))
    plt.bar(edges[:-1], hist, width=binwidth)
    for a in list(zip(maxima[0][0],maxima[0][1])):
        if (a[1] > 1E-04):
            plt.annotate('%s'%a[0], xy=a,xycoords='data', textcoords='offset points',xytext=(-15, 5))
    plt.show()
    #print('Partition %2d,  %d'%(partition,t9_t9d[partition].mode()[0]))

Time synch and network dynamics - time drift

A two-hour close-up shows that the different gateways exhibit different levels of time drift. Some gateways show a regular, wave-like drift of varying magnitude, while others exhibit an irregular drift.

We use a rolling median to approximate the time drift at any given point in time. A static window size was chosen for simplicity, sized such that the confidence interval of the first and third quartile is minimal.

Queuing effects lead to varying network delays (the spots above the 'base' delay).
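
The drift estimate itself is a one-liner (a sketch mirroring the t3a_t3n_estimate series computed in CalculateLatency; windowsize is the global defined at the top):

# rolling-median drift estimate and residual for one example partition
p = partitions['XEUR'][0]
drift = t3a_t3n[p].rolling(window=windowsize, center=True).median().ffill().bfill()
residual = t3a_t3n[p] - drift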

In [29]:
DrawPartitions('XEUR','t3a_t3n',(minTime + maxTime )/ 2 - 3600E09, (minTime + maxTime) / 2  + 3600E09, 0, t3a_t3n_lowmax,True)
In [30]:
DrawPartitions('XETR','t3a_t3n',(minTime + maxTime )/ 2 - 3600E09, (minTime + maxTime) / 2  + 3600E09, 0, t3a_t3n_lowmax,True)

Time synch and network dynamics - jitter and network queuing

With the approximation of the time drift by a rolling median, some residual jitter is left.
The charts below show the residuals after subtracting the rolling median.
There is a band around 0; these are requests that traverse the network without queuing and make up the vast majority.
Requests with a residual t_3a to t_3n latency of more than 50 ns are in most cases subject to network queuing.

In [31]:
DrawPartitionSeries('XEUR',t3a_t3n_residual,minTime, maxTime, -200,200 )

Time synch and network dynamics - jitter

The deviation from the rolling median for those requests that are not queued in the network is due to jitter in timestamping and in the network.
The plot below shows this jitter to be of magnitude +/- 8 ns (confidence interval, first to third quartile).

In [32]:
DrawPartitionHist(t3a_t3n_residual, emptySeries, np.arange(-50,50,1),'XEUR','Network delay residuals', True)

Network dynamics - queuing

Besides time drift and jitter, queuing occurs in the network when two requests are aimed at the same target and the request arriving earlier blocks the output port of the switch.

There are two points in the network where this queuing may occur:

  1. Access layer switch, i.e. requests arriving from different cross connects that target the same distribution layer switch
  2. Distribution layer switch, i.e. requests from different access layer switches that target the same gateway

The below analysis will touch upon these aspects.
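
Likely queued requests can be flagged directly from the residuals (a sketch; the 50 ns threshold is the heuristic mentioned earlier):

# share of fast requests per partition whose residual suggests queuing
for p in partitions['XEUR']:
    queued = (t3a_t3n_residual[p] > 50).mean()
    print('Partition %2d: %.2f%% likely queued' % (p, 100.0 * queued))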

Measuring reaction time

Assuming a preceding execution 'triggers' the next trade, we can calculate the reaction time between the receive time of an executing order and the preceding execution.
The expected order entry time of a request triggered by an execution can be calculated as

\begin{equation} t_{3a,n} = t_{9d,n-1} + \mathit{reactionTime} \end{equation}

and thus \begin{equation} \mathit{reactionTime} = t_{3a,n} - t_{9d,n-1} \end{equation} where \begin{equation} t_{3a,n} > t_{9d,n-1} + \mathit{minNetworkDelay} \end{equation}

The 'minimum network delay' has the following components

  • cable delay between the participant's installation and the access layer switch of ~900 ns each way
  • network delay between access layer switch and the measurement point in question, i.e.
    • t_9 to t_9d (median latency 1500 - 1600 ns, see below)
    • t_9d to t_9a (median latency 915 ns, information not included in HPT file)

Below we show how the HPT file can help to compare reaction times at the nanosecond level, as the timestamps t_3a and t_9d are subject to less jitter and exclude any network queuing of the incoming requests. We start by looking at reaction times based only on timestamps contained in the EOBI protocol.
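
In code this uses the t9d_t3a column prepared by CalculateTriggers (a sketch; minNetworkDelay taken from equation (5) above):

# measured reaction times, keeping only plausible trigger matches
minNetworkDelay = 2751
reaction = df['t9d_t3a'][df['t9d_t3a'] > minNetworkDelay]
print(reaction.quantile([0.25, 0.5, 0.75]))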

Measuring reaction time (based on t_3n and t_9)

The EOBI protocol contains the gateway order entry time t_3n of the aggressing order (RequestTime) and the application level send time (TransactTime) of the EOBI packet for every execution.

The below plot shows the distribution of the reaction time based on t_3n and t_9, corrected by the median latency t_3a to t_3n and t_9 to t_9d for later comparison.
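
The correction matches the t9d_t3a_median series computed in CalculateLatency:

\begin{equation} \widehat{t_{3a} - t_{9d}} = (t_{3n} - t_9) - \mathrm{median}(t_{9d} - t_9) - \mathrm{median}(t_{3n} - t_{3a}) \end{equation}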

In [33]:
DrawPartitions('XEUR','t9_t3n', minTime, maxTime,0, 2000000, False, False)
In [34]:
DrawPartitionHistSingle(3,t9d_t3n_corr, emptySeries,2600,4000,5, market, 't9d_t3n_corr','t9d_t3n_corr','5', False)
In [35]:
DrawPartitionHistSingle(3,t9d_t3a_estimate, t9d_t3n_corr,2600,4001,1, market, 't9d_t3n (time synch corrected)','t9d_t3a_estimate','t9d_t3n_corr', False)
DrawPartitionHistSingle(3,t9d_t3a, t9d_t3a_estimate,2600,4001,1, market, 't9d_t3n (time synch corrected)','t9d_t3a','t9d_t3a_estimate', False)
In [36]:
market='XEUR'
DrawPartitionHist(t9d_t3a_median, emptySeries,np.arange(0,25000,100), market, 'Reaction time t_9 to t_3 (%s)\nCorrected by median t9=>t9d (%d ns) and t3a=>t3n (%d ns)'%(marketNames[market],t9dCorr[market],t3nCorr[market]), False)
In [37]:
market='XEUR'
DrawPartitionHist(t9d_t3a_median, emptySeries,np.arange(2000,4000,10), market, 'Reaction time t_9 to t_3 closeup (%s)\nCorrected by median t9=>t9d (%d ns) and t3a=>t3n (%d ns)'%(marketNames[market],t9dCorr[market],t3nCorr[market]), False)
In [38]:
# market='XETR'
# DrawPartitionHist(t9d_t3a_median, emptySeries, binrange, market, 'Reaction time t_9 to t_3 for %s\nCorrected by median t9=>t9d (%d ns) and t3a=>t3n (%d ns)'%(market,t9dCorr[market],t3nCorr[market]), False)

Measuring reaction time (using network timestamp t_9d)

Using the network-level timestamp t_9d instead of t_9 eliminates the jitter introduced by the network card and the time drift between t_9 and t_9d.
The plot below shows the distribution of the reaction time based on t_3n and t_9d, corrected by the median latency t_3a to t_3n. As expected, the distribution is somewhat narrower than the original one.

In [39]:
market='XEUR'
DrawPartitionHist(t9d_t3n_corr, t9d_t3a_median, np.arange(2000,4000,10), market, 'Reaction time t_9d to t_3n (%s)\nCorrected by median t3a=>t3n (%d ns)'%(market,t3nCorr[market]), False)

Measuring reaction time (based on t3n taking into account time drift)

Taking the dynamic part of the time drift into account via a rolling-window median, the latency histogram reveals much more structure.

In [40]:
market='XEUR'
DrawPartitionHist(t9d_t3a_estimate, t9d_t3n_corr, np.arange(2500,4000,10), market, 'Reaction time t_9d to t_3n (%s)\nCorrected by rolling window median t3a=>t3n '%(market), False)

Measuring reaction time (based on t3a)

Using ETICaptTime (t_3a) from the HPT file, the histogram reveals the real reaction time, unaffected by time drift and, more importantly, by network queuing and jitter.
The distribution is not only sharper; it also reveals reaction times not visible in the distributions based on t_3n.

In [41]:
market='XEUR'
DrawPartitionHist(t9d_t3a, t9d_t3a_estimate, np.arange(2500,3500,2), market, '%s\nSpectrum of measured t_3a - t_9d'%(market), False)
In [42]:
partition=1
for partition in [1,2,3,4,50,58]:
    DrawScatter(t9d_t3a[partition], 
            t9d_t3a_estimate[partition], 
            "Reaction time t9d to t3a (ns)", 
            "Estimated reaction time based on t3n and median rolling delay t3a to t3n (ns)", 
            2750, 3200, 
            '\nPartition %d\nreaction time (estimated) vs reaction time (real)'%(partition))

T7 application latency aspects

Below we compare the T7 application latencies of aggressing orders with fast trigger reaction times (< 5 µs) against those with higher reaction times. We plot the paths gateway to matcher, matcher pre-execution, and matcher execution to market data out. The higher T7 latencies for the faster reaction times indicate higher load at those times, i.e. more competition.

In [43]:
def AnnotateHist(subPlot, series, binrange):
    hist, edges = np.histogram(series,  bins=binrange)
    maxima = list(map(lambda x: (edges[x],hist[x]),argrelmax(hist, order=15)))
    maxHist  = max(maxima[0][1])
    for a in list(zip(maxima[0][0],maxima[0][1])):
        if (a[1] > maxHist /2):
            subPlot.annotate('%s'%(a[0]), xy=a,xycoords='data', textcoords='offset points',xytext=(-15,5))
In [44]:
marketSegmentId = 589
partition = 1
pylab.rcParams['figure.figsize'] = (20, 8)
lowmax = 5000
condition= ( df['PartitionID'] == partition ) & ( df['MarketSegmentID'] == marketSegmentId ) & ( df['t9d_t3a'] < lowmax )

binrangeapp = np.arange(0,100000,100)
plt.title('Market Segment %d (Partition %d): T7 Application latency for fast reactions' %(marketSegmentId, partition))
plt.xlabel("Latency (ns)")
plt.ylabel("#observations")
plt.xlim([0,100000])
n,a,b = plt.hist(df['t3n_t5'][condition].dropna(), bins=binrangeapp,color=plotColors[0], alpha=0.75, label='t_3n to t_5')
n,a,b = plt.hist(df['t5_t7'][condition].dropna(), bins=binrangeapp,color=plotColors[2], alpha=0.75, label='t_5 to t_7')
n,a,b = plt.hist(df['t7_t9'][condition].dropna(), bins=binrangeapp,color=plotColors[1], alpha=0.75, label='t_7 to t_9')
AnnotateHist(plt,df['t3n_t5'][condition].dropna(),binrangeapp)
AnnotateHist(plt,df['t5_t7'][condition].dropna(),binrangeapp)
AnnotateHist(plt,df['t7_t9'][condition].dropna(),binrangeapp)
plt.legend()
plt.show()
In [45]:
condition=(df['PartitionID'] == partition) & (df['MarketSegmentID'] == marketSegmentId) & (df['t9d_t3a'] >= lowmax)

pylab.rcParams['figure.figsize'] = (20, 8)
ts     = df['RequestTime'][condition].dropna()
meIn   = df['AggressorTime'][condition].dropna()
execID = df['ExecID'][condition].dropna()
mdOut  = df['TransactTime'][condition].dropna()
    
# 
plt.title('Market Segment %d (Partition %d): T7 Application latency for slower reactions' %(marketSegmentId, partition))
plt.xlabel("Latency (ns)")
plt.ylabel("#observations")
plt.xlim([0,100000])

n,a,b = plt.hist(meIn- ts, bins=binrangeapp,color=plotColors[0], alpha=0.75, label='t_3n to t_5')
n,a,b = plt.hist(execID- meIn, bins=binrangeapp,color=plotColors[2], alpha=0.75, label='t_5 to t_7')
n,a,b = plt.hist(mdOut- execID, bins=binrangeapp,color=plotColors[1], alpha=0.75, label='t_7 to t_9')
AnnotateHist(plt,meIn-ts,binrangeapp)
AnnotateHist(plt,execID- meIn,binrangeapp)
AnnotateHist(plt,mdOut- execID,binrangeapp)
plt.legend()
plt.show()

Triggering trades

The matrix below shows how often trades in one product (rows) trigger prompt executions in another product (columns).

In [46]:
condition = (df['t9d_t3a'] < 4000) & (df['t9d_t3a'] > 2750) 
t=pd.DataFrame(df[condition].groupby(by=['triggerProd','prod']).size())

%matplotlib inline
fig, ax = plt.subplots(figsize=(50,20))
a= t.unstack(fill_value=0)[0].sort_index(axis=1).sort_index(axis=0)
triggerMax = 0
threshold = 200
for s in a.columns:
    if (a[s].max() > triggerMax):
        triggerMax = a[s].max()
    if a[s].sum() < threshold:
        a=a.drop(s, axis=1)
for s in a.index:
    if a.loc[s].sum() < threshold and not (s in a.columns) :
        a=a.drop(s, axis =0)
b= pd.DataFrame(a)
a= a.where(a>0).fillna(1).astype(int)
hm=sns.heatmap(a, norm=LogNorm(1,triggerMax), cbar_kws={"ticks":[0,1,10,1e2,1e3,1e4,1e5]}, cmap='RdYlGn_r', linewidths=0.0, annot=b, annot_kws={"size": 15, 'color' : 'black'}, fmt="d", ax=ax) 
for label in hm.get_yticklabels():
    label.set_size(15)
    label.set_weight("bold")
    label.set_rotation('horizontal')
    if label.get_text() in ["XEUR-FESX", 'XEUR-FDAX']:
        label.set_color("red")
for label in hm.get_xticklabels():
    label.set_size(15)
    label.set_weight("bold")
    label.set_rotation('vertical')
    if label.get_text() in ["XEUR-FESX", 'XEUR-FDAX']:
        label.set_color("red")

Questions or Comments

In case of questions or comments please do not hesitate to contact us at monitoring@deutsche-boerse.com or Sebastian.Neusuess@deutsche-boerse.com.

Disclaimer

© Deutsche Börse 2018 Deutsche Börse AG (DBAG), Clearstream Banking AG (Clearstream), Eurex Frankfurt AG, Eurex Clearing AG (Eurex Clearing) as well as Eurex Bonds GmbH (Eurex Bonds) and Eurex Repo GmbH (Eurex Repo) are corporate entities and are registered under German law. Eurex Zürich AG is a corporate entity and is registered under Swiss law. Clearstream Banking S.A. is a corporate entity and is registered under Luxembourg law. U.S. Exchange Holdings, Inc. is a corporate entity and is registered under U.S. American law. Deutsche Boerse Asia Holding Pte. Ltd., Eurex Clearing Asia Pte. Ltd. and Eurex Exchange Asia Pte. Ltd are corporate entities and are registered under Singapore law. Eurex Frankfurt AG (Eurex) is the administrating and operating institution of Eurex Deutschland. Eurex Deutschland and Eurex Zürich AG are in the following referred to as the “Eurex Exchanges”. All intellectual property, proprietary and other rights and interests in this publication and the subject matter hereof (other than certain trademarks and service marks listed below) are owned by DBAG and its affiliates and subsidiaries including, without limitation, all patent, registered design, copyright, trademark and service mark rights. While reasonable care has been taken in the preparation of this publication to provide details that are accurate and not misleading at the time of publication DBAG, Clearstream, Eurex, Eurex Clearing, Eurex Bonds, Eurex Repo as well as the Eurex Exchanges and their respective servants and agents (a) do not make any representations or warranties regarding the information contained herein, whether express or implied, including without limitation any implied warranty of merchantability or fitness for a particular purpose or any warranty with respect to the accuracy, correctness, quality, completeness or timeliness of such information, and (b) shall not be responsible or liable for any third party’s use of any information contained herein under any circumstances, including, without limitation, in connection with actual trading or otherwise or for any errors or omissions contained in this publication. This publication is published for information purposes only and shall not constitute investment advice respectively does not constitute an offer, solicitation or recommendation to acquire or dispose of any investment or to engage in any other transaction. This publication is not intended for solicitation purposes but only for use as general information. All descriptions, examples and calculations contained in this publication are for illustrative purposes only. Eurex and Eurex Clearing offer services directly to members of the Eurex exchanges respectively to clearing members of Eurex Clearing. Those who desire to trade any products available on the Eurex market or who desire to offer and sell any such products to others or who desire to possess a clearing license of Eurex Clearing in order to participate in the clearing process provided by Eurex Clearing, should consider legal and regulatory requirements of those jurisdictions relevant to them, as well as the risks associated with such products, before doing so. 
Eurex derivatives are currently not available for offer, sale or trading in the United States or by United States persons (other than EURO STOXX 50® Index Futures, EURO STOXX 50® ex Financials Index Futures, EURO STOXX® Select Dividend 30 Index Futures, EURO STOXX® Index Futures, EURO STOXX® Large/Mid/Small Index Futures, STOXX® Europe 50 Index Futures, STOXX® Europe 600 Index Futures, STOXX® Europe 600 Banks/Industrial Goods & Services/Insurance/Media/Travel & Leisure/Utilities Futures, STOXX® Europe Large/Mid/Small 200 Index Futures, Dow Jones Global Titans 50 IndexSM Futures (EUR & USD), DAX®/Mini-DAX®/MDAX®/TecDAX® Futures, SMIM® Futures, SLI Swiss Leader Index® Futures, MSCI World (FMWO, FMWP, FMWN)/Europe (FMEU, FMEP)/ Europe Value/Europe Growth/Emerging Markets (FMEM, FMEF, FMEN)/Emerging Markets Latin America/Emerging Markets EMEA/Emerging Markets Asia/China Free/India/Japan/Malaysia/South Africa/Thailand/AC Asia Pacific ex Japan Index Futures, TA-25 Index Futures, Daily Futures on TAIEX Futures, VSTOXX® Futures, Gold and Silver Futures as well as Eurex FX, property and interest rate derivatives). Trademarks and Service Marks Buxl®, DAX®, DivDAX®, eb.rexx®, Eurex®, Eurex Bonds®, Eurex Repo®, Eurex Strategy WizardSM, Euro GC Pooling®, FDAX®, FWB®, GC Pooling®,,GCPI®, MDAX®, ODAX®, SDAX®, TecDAX®, USD GC Pooling®, VDAX®, VDAX-NEW® and Xetra® are registered trademarks of DBAG. All MSCI indexes are service marks and the exclusive property of MSCI Barra. ATX®, ATX® five, CECE® and RDX® are registered trademarks of Vienna Stock Exchange AG. IPD® UK Annual All Property Index is a registered trademark of Investment Property Databank Ltd. IPD and has been licensed for the use by Eurex for derivatives. SLI®, SMI® and SMIM® are registered trademarks of SIX Swiss Exchange AG. The STOXX® indexes, the data included therein and the trademarks used in the index names are the intellectual property of STOXX Limited and/or its licensors Eurex derivatives based on the STOXX® indexes are in no way sponsored, endorsed, sold or promoted by STOXX and its licensors and neither STOXX nor its licensors shall have any liability with respect thereto. Dow Jones is a service mark of Dow Jones & Company, Inc. All derivatives based on these indexes are not sponsored, endorsed, sold or promoted by Dow Jones & Company, Inc. Dow Jones & Company, Inc. does not make any representation regarding the advisability of trading or of investing in such products. Bloomberg Commodity IndexSM and any related sub-indexes are service marks of Bloomberg L.P. All references to London Gold and Silver Fixing prices are used with the permission of The London Gold Market Fixing Limited as well as The London Silver Market Fixing Limited, which for the avoidance of doubt has no involvement with and accepts no responsibility whatsoever for the underlying product to which the Fixing prices may be referenced. PCS® and Property Claim Services® are registered trademarks of ISO Services, Inc. Korea Exchange, KRX, KOSPI and KOSPI 200 are registered trademarks of Korea Exchange Inc. Taiwan Futures Exchange and TAIFEX are registered trademarks of Taiwan Futures Exchange Corporation. Taiwan Stock Exchange, TWSE and TAIEX are the registered trademarks of Taiwan Stock Exchange Corporation. BSE and SENSEX are trademarks/service marks of Bombay Stock Exchange (BSE) and all rights accruing from the same, statutory or otherwise, wholly vest with BSE. 
Any violation of the above would constitute an offence under the laws of India and international treaties governing the same. The names of other companies and third party products may be trademarks or service marks of their respective owners.

In [47]:
HTML('''Code toggle: click <a href="javascript:code_toggle()">here</a>.''')
Out[47]:
Code toggle: click here.
In [48]:
# assign true/false depending on eti capture time being there
dfall['hasETITime'] = dfall['ETICaptTime']>0
# group by MarketSegmentID
grouped = dfall.groupby(by=['MarketSegmentID'])
# group by MarketSegmentID & eti existence
groupedeti = dfall.groupby(by=['MarketSegmentID','hasETITime'])
In [49]:
# a gives you the top 20 products by count
a = grouped.size().sort_values(ascending=False).head(n=20)
# b gives you the number of messages with ETICaptTime > 0 by product
b = dfall[dfall['ETICaptTime']>0].groupby('MarketSegmentID').size().sort_values(ascending=False)
# complete new dataframe with the same index as the top 20 products
matchRate = pd.DataFrame(index = a.index, columns=['mr','cnt','eti'])
# a and b have MarketSegmentID as index
In [50]:
print('Top 20 match ratios')
print('MarketSegment Cnt Eti Ratio')
# for each of the top 20 products, store message count, number of messages with ETICaptTime > 0 and their ratio
matchRate['cnt'] = a
matchRate['eti'] = b
matchRate['mr'] = (100.0 * matchRate['eti']) / matchRate['cnt']
for i,row in matchRate.iterrows():
    print('%s\t%d\t%d\t%.2f%%'%(i,row['cnt'], row['eti'], row['mr']))
Top 20 match ratios
MarketSegment Cnt Eti Ratio
1355	119738	91553	76.46%
589	84996	58611	68.96%
675	58591	42720	72.91%
688	45917	31725	69.09%
48067	34238	25135	73.41%
757	25830	21757	84.23%
52570	19076	11873	62.24%
749	16373	9741	59.49%
52571	16248	11946	73.52%
691	14191	11942	84.15%
52880	13754	9547	69.41%
689	13534	8224	60.77%
52795	13088	9197	70.27%
636	12934	8086	62.52%
52983	11522	8257	71.66%
52984	9323	7049	75.61%
52995	9165	6614	72.17%
816	8346	7508	89.96%
53040	7919	5483	69.24%
690	7909	2104	26.60%
In [51]: