Source code for clustering

#!/usr/bin/env python
# encoding: utf-8
"""
Provides classes and functions for clustering of earthquakes based on the
depth-first algorithm.
"""

import os
import glob
import matplotlib.pyplot as plt
import numpy as np
from CZ_color import CZ_Clust_2_color, CZ_W_2_color
from OP_waveforms import Waveform
import logging
from correlation import BinaryFile
from locations_trigger import read_locs_from_file
from NllGridLib import read_stations_file


[docs]class Graph(object): """ Class of Graph objects. Contains 4 attributes: ** Attributes ** .. attribute:: flag Indicates if an event already belongs to a cluster (1) or not (0). .. attribute:: cluster_index Indicates the cluster number of all events. 0 means the event does not belong to any cluster. .. attribute:: neighbours Lists the indexes of the neighbour events for each event. GRAPH.neighbours is then a list of lists. """ def __init__(self): """ Initialises all attributes to empty lists [] """ self.flag = [] self.cluster_index = [] self.neighbours = []
[docs] def set_flag(self, value): """ Appends value to flag attribute. :param value: Value to append (0=False, 1=True). :type value: integer """ self.flag.append(value)
[docs] def set_cluster_index(self, value): """ Appends value to cluster_index attribute :param value: Value to append. :type value: integer """ self.cluster_index.append(value)
[docs] def set_neighbours(self, value): """ Appends value to neighbours attribute :param value: Value to append (list of integers) :type value: list """ self.neighbours.append(value)
[docs]def DFS(GRAPH, summit_first, cluster_ind): """ Implementation of the depth first search (DFS) algorithm. This is a recursive algorithm. See detailed description in ``do_clustering function``. :param GRAPH: the algorithm updates GRAPH.flag and GRAPH.cluster_index :param summit_first: event index by which the research of neighbours begins :param cluster_ind: cluster index :type GRAPH: Graph object :type summit_first: integer :type cluster_ind: integer """ GRAPH.flag[summit_first] = 1 GRAPH.cluster_index[summit_first] = cluster_ind for ind_summit_fils in range(len(GRAPH.neighbours[summit_first])): summit_fils = GRAPH.neighbours[summit_first][ind_summit_fils] if GRAPH.flag[summit_fils] == 0: cluster_ind = DFS(GRAPH, summit_fils, cluster_ind) return cluster_ind
[docs]def waveval(stack_time, t_before, t_after, dt, tdeb): """ Finds the indexes corresponding to t_before and t_after :param stack_time: origin time of the event :param t_before: time taken before the origin time :param t_after: time taken after the origin time :param dt: waveform sample distance in seconds :param tdeb: start time of the waveform :type stack_time: UTCDateTime :type t_before: float :type t_after: float :type dt: float :type tdeb: UTCDateTime :rtype: integer :returns: indexes corresponding to t_before and t_after """ tstart = stack_time-t_before-tdeb tend = stack_time+t_after-tdeb i_start = int(round(tstart*1./dt)) i_end = int(round(tend*1./dt)) return i_start, i_end
[docs]def plot_traces(CLUSTER, delay_file, coeff, locs, datadir, data_files, threshold): """ Plots the waveforms of all possible event pairs within a cluster. On the same figure, displays the superimposed waveforms of the event pair for all stations. Also displays the correlation value. :param CLUSTER: dictionary containing the event indexes belonging to each cluster :param delay_file: file name of the file containing the time delays :param coeff: cross-correlation values of all possible event pairs for all stations :param locs: list of the whole Waveloc locations (each element of the list is a dictionary) :param datadir: data directory path :param data_files: list of paths of data files :param threshold: correlation coefficient threshold :type CLUSTER: dictionary :type delay_file: string :type coeff: dictionary :type locs: list :type datadir: string :type data_files: list :type threshold: float """ # Read the file containing the time delays a = BinaryFile(delay_file) delay = a.read_binary_file() t_before = 0.5 t_after = 6.0 tr = {} for data_file in data_files: wf = Waveform() wf.read_from_file(data_file) tr[wf.station] = wf.values dt = wf.delta tdeb = wf.starttime list_name = sorted(tr) for i in range(1, len(CLUSTER)+1): # cluster index for j in range(len(CLUSTER[i])): # first event index e1 = CLUSTER[i][j] for k in range(j+1, len(CLUSTER[i])): # second event index e2 = CLUSTER[i][k] co = 0 fig = plt.figure() fig.set_facecolor('white') for l in range(len(list_name)): name = list_name[l] if delay[name][e1-1][e2-1] != 'NaN': stack_time_1 = locs[e1-1]['o_time'] i_start_1, i_end_1 = waveval(stack_time_1, t_before, t_after, dt, tdeb) val1 = tr[name][i_start_1-1:i_end_1] stack_time_2 = locs[e2-1]['o_time'] i_start_2, i_end_2 = \ waveval(stack_time_2-delay[name][e1-1][e2-1], t_before, t_after, dt, tdeb) val2 = tr[name][i_start_2-1:i_end_2] t = np.linspace(0, t_after+t_before, (t_after+t_before)/dt+1) ax = fig.add_subplot(len(list_name), 1, l+1) ax.set_axis_off() ax.plot(t, val1/max(val1), 'k') ax.plot(t, val2/max(val2), 'y--') c = 'k' if coeff[name][e1-1][e2-1] >= threshold: co = co+1 c = 'r' ax.text(0.2, 0.5, "%s, %s, %s" % (name, str(coeff[name][e1-1][e2-1]), str(delay[name][e1-1][e2-1])), color=c) fig.suptitle("Cluster : %s ; Event pair : (%s,%s) ; %d" % (str(i), str(e1), str(e2), co)) plt.show()
[docs]def compute_nbsta(event, coeff, threshold): """ Computes the number of stations where the correlation value is greater than the threshold for every event pair :param event: total number of events located by Waveloc :param coeff: cross-correlation coefficients of all possible event pairs for all stations :param threshold: correlation value threshold set to form a cluster :type event: integer :type coeff: dictionary :type threshold: float :rtype: numpy matrix containing integers :returns: matrix of number of stations """ nbsta = [] for i in xrange(event): liste = [] for k in xrange(i): liste.append(0) for j in xrange(i, event): c = 0 if i != j: for name in sorted(coeff): if coeff[name] and coeff[name][i][j] >= threshold and \ coeff[name][i][j] != 'NaN': c = c+1 liste.append(c) else: liste.append(0) nbsta.append(liste) nbsta = np.matrix(nbsta) return nbsta
[docs]def do_clustering(event, nbsta, nbmin): """ First step: all events are examined one by one. For each of them, the indexes of the events where there is a sufficient number of stations (i.e. the number exceeds or equals nbmin) are kept in GRAPH.neighbours. All events are flagged to 0 and belong to cluster 0 by default. The indexes of the events for which neighbours were found are written in the list ``summits``. From this list (summits), we extract the event which has the greatest number of neighbours. The clustering process will start with that event. Then the DFS algorithm is applied: when an event is examined, it is flagged to 1 and its cluster number is also updated (GRAPH.cluster_index). The DFS algorithm is recursive and explores each possible path until it ends: it means that when the research starts with a given event, it will search for the the first neighbour, and then the first neighbour of this first neighbour and so on... When all the ``first neighbours`` are found, the research can concentrates on second neighbours, and so on until all the neighbours are found. Once the DFS algorithm has found all events linked to summit_first, we write the corresponding indexes into the dictionary CLUSTER (where the keys are the cluster indexes) and look for events with neighbours which are still not flagged to 1. The process keeps going until the whole events with neighbours belong to a cluster. The function finally returns the dictionary CLUSTER. :param event: total number of events in the Waveloc location file :param nbsta: 2-D matrix containing the number of stations where the cross-correlation value is greater than a given threshold for all possible event pairs :param nbmin: minimum number of stations where the cross-correlation value should be greater than a given threshold to form a cluster :type event: integer :type nbsta: matrix :type nbmin: integer :rtype: dictionary :returns: indexes of events forming each cluster """ # CODE CHRISTOPHE - CLUSTERING : DEPTH FIRST SEARCH neighbours_of_I_summit__horiz = [] neighbours_of_I_summit__verti = [] GRAPH = Graph() summits = [] for I in range(event): neighbours_of_I_summit__verti = \ (np.where(nbsta[:, I] >= nbmin)[0]).tolist()[0] neighbours_of_I_summit__horiz = \ (np.where(nbsta[I, :] >= nbmin)[1]).tolist()[0] GRAPH.set_neighbours(neighbours_of_I_summit__verti + neighbours_of_I_summit__horiz) GRAPH.set_flag(0) GRAPH.set_cluster_index(0) if neighbours_of_I_summit__verti + neighbours_of_I_summit__horiz: summits.append(I) if summits: nb_max_neighbours = 0 for ind_summit in range(len(summits)): l = len(GRAPH.neighbours[summits[ind_summit]]) if l > nb_max_neighbours: summit_first = summits[ind_summit] nb_max_neighbours = l ind_summit_first = 0 cluster_ind = 0 CLUSTER = {} while 1: event_index_flagged = [] event_index_non_flagged_with_neighbours = [] ind_summit_first = ind_summit_first+1 cluster_ind = cluster_ind+1 cluster_ind = DFS(GRAPH, summit_first, cluster_ind) for k in range(len(GRAPH.neighbours)): if GRAPH.flag[k] == 1 and \ GRAPH.cluster_index[k] == ind_summit_first: event_index_flagged.append(k) elif GRAPH.flag[k] == 0 and len(GRAPH.neighbours[k]) != 0: event_index_non_flagged_with_neighbours.append(k) # add 1 to each event number as the first one is number one (and # not zero) CLUSTER[cluster_ind] = list(event_index_flagged + np.ones(len(event_index_flagged), dtype=np.int)) if len(event_index_non_flagged_with_neighbours) > 1: summit_first = event_index_non_flagged_with_neighbours[0] else: break return CLUSTER else: return {}
[docs]def plot_graphs(locs, stations, nbsta, CLUSTER, nbmin, threshold): """ Displays two figures. On the first plot, all events are represented. Those belonging to a cluster are color-coded and labelled with the cluster index. On the second plot, all events are also represented, but those belonging to a cluster are colored in black. The links between events of a same cluster are also plotted and color-coded in function of the number of stations where the correlation value exceeds the threshold. Uses mlab from mayavi :param locs: list of the whole Waveloc locations (each element of the list is a dictionary) :param stations: dictionary of stations :param nbsta:2-D matrix containing the number of stations where the cross-correlation value is greater than a given threshold for all possible event pairs :param CLUSTER: dictionary containing the indexes of the events for each cluster :param nbmin: minimum number of stations where the cross-correlation value should be greater than a given threshold to form a cluster :param threshold: correlation value threshold set to form a cluster :type locs: dictionary :type stations: dictionary :type nbsta: matrix :type CLUSTER: dictionary :type nbmin: integer :type threshold: float """ from mayavi import mlab # Event coordinates stack_x, stack_y, stack_z = [], [], [] for loc in locs: stack_x.append(loc['x_mean']) stack_y.append(loc['y_mean']) stack_z.append(-loc['z_mean']) # Extract coordinates xsta, ysta, zsta = [], [], [] for sta in sorted(stations): xsta.append(stations[sta]['x']) ysta.append(stations[sta]['y']) zsta.append(stations[sta]['elev']) # 3D PLOT USING MAYAVI logging.info("Plotting...") mlab.figure(1, bgcolor=(1, 1, 1), fgcolor=(0, 0, 0), size=(1000, 900)) mlab.points3d(xsta, ysta, zsta, color=(1, 0, 0), scale_factor=0.05, mode='cube') mlab.axes(extent=[362, 370, 7647, 7653, -0.5, 2.5], color=(0, 0, 0)) mlab.outline(extent=[362, 370, 7647, 7653, -0.5, 2.5], color=(0, 0, 0)) mlab.points3d(stack_x, stack_y, stack_z, scale_factor=0.1, color=(0.8, 0.8, 0.8)) mlab.title("threshold=%s, nbmin=%s" % (threshold, nbmin), height=0.1, size=0.35, color=(0, 0, 0)) for i_ev in range(len(nbsta)): for i_c in range(1, len(CLUSTER)+1): if i_ev+1 in CLUSTER[i_c]: mlab.points3d(stack_x[i_ev], stack_y[i_ev], stack_z[i_ev], scale_factor=0.1, color=tuple(CZ_Clust_2_color(100 * (len(CLUSTER)-i_c) / len(CLUSTER)))) mlab.text3d(stack_x[i_ev], stack_y[i_ev], stack_z[i_ev], str(i_c), color=(0, 0, 0), scale=0.1) logging.info("Done!") logging.info("Plotting...") mlab.figure(2, bgcolor=(1, 1, 1), fgcolor=(0, 0, 0), size=(1000, 900)) mlab.points3d(xsta, ysta, zsta, color=(1, 0, 0), scale_factor=0.05, mode='cube') mlab.axes(extent=[362, 370, 7647, 7653, -0.5, 2.5], color=(0, 0, 0)) mlab.outline(extent=[362, 370, 7647, 7653, -0.5, 2.5], color=(0, 0, 0)) mlab.points3d(stack_x, stack_y, stack_z, scale_factor=0.1, color=(0.8, 0.8, 0.8)) mlab.title("threshold=%s, nbmin=%s" % (threshold, nbmin), height=0.1, size=0.35, color=(0, 0, 0)) print nbsta for ind_I in range(len(nbsta)): for ind_J in range(ind_I+1, len(nbsta)): W_IJ = nbsta[ind_I, ind_J] if W_IJ >= nbmin: mlab.points3d(stack_x[ind_J], stack_y[ind_J], stack_z[ind_J], scale_factor=0.1, color=(0, 0, 0)) mlab.points3d(stack_x[ind_I], stack_y[ind_I], stack_z[ind_I], scale_factor=0.1, color=(0, 0, 0)) d = (stack_x[ind_J]-stack_x[ind_I], stack_y[ind_J]-stack_y[ind_I], stack_z[ind_J]-stack_z[ind_I]) norm = np.sqrt(d[0]**2+d[1]**2+d[2]**2) mlab.quiver3d(stack_x[ind_I], stack_y[ind_I], stack_z[ind_I], d[0], d[1], d[2], color=tuple(CZ_W_2_color(W_IJ)), mode='2ddash', scale_factor=norm, scale_mode='scalar') logging.info("Done!") mlab.show()
[docs]def do_clustering_setup_and_run(opdict): """ Does clustering by applying the depth first search algorithm and saves the result (= a dictionary containing the event indexes forming each cluster) in a binary file. Needs to define the correlation value threshold and the minimum number of stations where this threshold should be reached to form a cluster (should be done in the options dictionary) :param opdict: Dictionary of waveloc options """ base_path = opdict['base_path'] verbose = opdict['verbose'] # stations stations_filename = os.path.join(base_path, 'lib', opdict['stations']) # data data_dir = os.path.join(base_path, 'data', opdict['datadir']) data_glob = opdict['dataglob'] data_files = glob.glob(os.path.join(data_dir, data_glob)) data_files.sort() # location file locdir = os.path.join(base_path, 'out', opdict['outdir'], 'loc') loc_filename = os.path.join(locdir, 'locations.dat') # file containing correlation values coeff_file = os.path.join(locdir, opdict['xcorr_corr']) # Read correlation values b = BinaryFile(coeff_file) coeff = b.read_binary_file() # INPUT PARAMETERS nbmin = int(opdict['nbsta']) if nbmin > len(coeff.keys()): raise Exception('the minimum number of stations cannot be > to the\ number of stations !!') event = len(coeff.values()[0]) tplot = float(opdict['clus']) # threshold for which we save and plot cluster_file = "%s/cluster-%s-%s" % (locdir, str(tplot), str(nbmin)) corr = [opdict['clus']] #corr = np.arange(0, 1.1, 0.1) for threshold in corr: threshold = float(threshold) nbsta = compute_nbsta(event, coeff, threshold) CLUSTER = do_clustering(event, nbsta, nbmin) if threshold == tplot: print "----------------------------------------------" print "THRESHOLD : ", threshold, " # STATIONS : ", nbmin print "# CLUSTERS : ", len(CLUSTER) print CLUSTER c = BinaryFile(cluster_file) c.write_binary_file(CLUSTER) print "Written in %s" % cluster_file if verbose: # PLOT # Read location file locs = read_locs_from_file(loc_filename) # Read station file stations = read_stations_file(stations_filename) # Look at the waveforms #plot_traces(CLUSTER, delay_file, coeff, locs, # data_dir, data_files, threshold) # Plot graphs plot_graphs(locs, stations, nbsta, CLUSTER, nbmin, threshold)