Source code for migration

#!/usr/bin/env python
# encoding: utf-8

import os
import h5py
import logging
import glob
import numpy as np

from obspy.core import utcdatetime
from itertools import count, islice
from time import time

from OP_waveforms import read_data_compatible_with_time_dict
from NllGridLib import read_hdr_file
from hdf5_grids import get_interpolated_time_grids
from filters import smooth



def do_migration_setup_and_run(opdict):
    """
    Do setup and launch migration.

    :param opdict: WavelocOptions.opdict
    """
    base_path = opdict['base_path']
    runtime = opdict['time']
    reloc = opdict['reloc']

    # data
    data_dir = os.path.join(base_path, 'data', opdict['datadir'])
    if opdict['kderiv']:
        data_glob = opdict['gradglob']
        if opdict['gauss']:
            data_glob = opdict['gaussglob']
    else:
        data_glob = opdict['kurtglob']
    data_files = glob.glob(os.path.join(data_dir, data_glob))
    data_files.sort()
    if len(data_files) == 0:
        logging.error('No data files found for %s and %s' %
                      (data_dir, data_glob))
        raise UserWarning

    # grids
    search_grid_filename = os.path.join(base_path, 'lib',
                                        opdict['search_grid'])
    time_grids = get_interpolated_time_grids(opdict)

    # start and end times
    starttime = opdict['starttime']
    endtime = opdict['endtime']
    data_length = opdict['data_length']
    data_overlap = opdict['data_overlap']

    initial_start_time = utcdatetime.UTCDateTime(starttime)
    initial_end_time = initial_start_time + data_length
    final_end_time = utcdatetime.UTCDateTime(endtime)
    time_shift_secs = data_length - data_overlap

    ######### FOR EACH TIME SPAN - DO MIGRATION #############

    # start loop over time
    start_time = initial_start_time
    end_time = initial_end_time

    if runtime:
        t_ref = time()

    while start_time < final_end_time:
        # read data
        logging.info("Reading data : %s - %s." %
                     (start_time.isoformat(), end_time.isoformat()))
        data, delta = \
            read_data_compatible_with_time_dict(data_files, time_grids,
                                                start_time, end_time)

        if reloc:
            tr_glob = opdict['kurtglob']
            files = glob.glob(os.path.join(data_dir, tr_glob))
            traces, delta = \
                read_data_compatible_with_time_dict(files, time_grids,
                                                    start_time, end_time)
            sta_list = sorted(traces)
            for staname in sta_list:
                snr = np.max(traces[staname]) / \
                    np.mean(np.abs(traces[staname]))
                if snr < opdict['reloc_snr']:
                    data[staname] = np.zeros(len(data[staname]))

        # re-read grid_info at each iteration to make sure it is a clean copy
        grid_info = read_hdr_file(search_grid_filename)

        # do migration if we have enough data (3 stations is the bare minimum)
        if len(data.keys()) >= 3:
            logging.info("Migrating data : %s - %s." %
                         (start_time.isoformat(), end_time.isoformat()))
            do_migration_loop_continuous(opdict, data, delta, start_time,
                                         grid_info, time_grids)
        elif len(data.keys()) == 0:
            logging.warn('No data found between %s and %s.' %
                         (start_time.isoformat(), end_time.isoformat()))
        else:
            logging.warn('Insufficient data found between %s and %s.' %
                         (start_time.isoformat(), end_time.isoformat()))

        # reset the start and end times to loop again
        start_time = start_time + time_shift_secs
        end_time = end_time + time_shift_secs

    if runtime:
        t = time() - t_ref
        logging.info("Time for migrating all time slices : %.2f s\n" % t)
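
# A minimal, hypothetical sketch (not part of the original module) of how
# do_migration_setup_and_run might be driven. The keys shown mirror those
# read above; any further keys required by get_interpolated_time_grids are
# omitted, and every path and glob pattern here is a placeholder.
def _example_setup_and_run():
    opdict = {
        'base_path': '/path/to/waveloc',    # hypothetical project root
        'datadir': 'EXAMPLE',               # data in base_path/data/datadir
        'outdir': 'EXAMPLE_OUT',
        'search_grid': 'grid.search.hdr',   # header file in base_path/lib
        'kurtglob': '*kurt.mseed',          # hypothetical kurtosis-file glob
        'kderiv': False,                    # stack kurtosis, not its gradient
        'gauss': False,
        'reloc': False,
        'time': True,                       # log timing information
        'use_ram': True,
        'starttime': '2010-10-14T00:00:00.0Z',
        'endtime': '2010-10-14T01:00:00.0Z',
        'data_length': 600,                 # seconds per migration window
        'data_overlap': 60,                 # seconds of window overlap
    }
    do_migration_setup_and_run(opdict)
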

def do_migration_loop_continuous(opdict, data, delta, start_time, grid_info,
                                 time_grids, keep_grid=False,
                                 keep_stacks=True):
    """
    Do continuous migration loop.

    TODO : flesh out this doc-string.
    """
    logging.info("Processing time slice %s" % start_time.isoformat())

    options_time = opdict['time']
    use_ram = opdict['use_ram']
    output_dir = os.path.join(opdict['base_path'], 'out', opdict['outdir'])
    options_reloc = opdict['reloc']

    nx = grid_info['nx']
    ny = grid_info['ny']
    nz = grid_info['nz']
    n_buf = nx * ny * nz

    min_npts = min([len(data[key]) for key in data.keys()])

    dx = grid_info['dx']
    dy = grid_info['dy']
    dz = grid_info['dz']

    x_orig = grid_info['x_orig']
    y_orig = grid_info['y_orig']
    z_orig = grid_info['z_orig']

    if options_time:
        t_ref = time()

    # open hdf5 file for stack_grid
    if options_reloc:
        grid_filename = os.path.join(output_dir, 'grid',
                                     'reloc_stack_grid_%s.hdf5' % start_time)
    else:
        grid_filename = os.path.join(output_dir, 'grid',
                                     'stack_grid_%s.hdf5' % start_time)
    logging.info('Creating grid file %s' % grid_filename)

    if not use_ram:
        # if running on a small-memory machine, write to the hdf5 file
        f = h5py.File(grid_filename, 'w')
        stack_grid = f.create_dataset('stack_grid', (n_buf, min_npts), 'f',
                                      chunks=(1, min_npts))
        stack_grid[...] = 0.
    else:
        # if running on a big-memory machine, work in ram
        stack_grid = np.empty((n_buf, min_npts), dtype='float32', order='F')
        stack_grid[...] = 0.

    # DO MIGRATION
    # logging.debug("Data sum = %.6f" % np.sum(data.values()))
    stack_shift_time = migrate_4D_stack(data, delta, time_grids, stack_grid,
                                        use_ram)
    stack_start_time = start_time - stack_shift_time
    n_buf, nt = stack_grid.shape

    if options_time:
        t = time() - t_ref
        logging.info("Time for migrating %d stacks, each of extent %d points \
: %.2f s\n" % (n_buf, nt, t))

    if keep_stacks:
        if options_time:
            t_ref = time()
        if options_reloc:
            stack_filename = os.path.join(output_dir, 'stack',
                                          'reloc_stack_all_%s.hdf5'
                                          % start_time)
        else:
            stack_filename = os.path.join(output_dir, 'stack',
                                          'stack_all_%s.hdf5' % start_time)
        logging.info('Extracting max_val etc. to %s' % stack_filename)
        f_stack = h5py.File(stack_filename, 'w')

        # extract maxima
        extract_max_values(stack_grid, grid_info, f_stack, use_ram=use_ram)
        for name in f_stack:
            dset = f_stack[name]
            logging.debug('After extract_max_values : %s %f %f' %
                          (name, np.max(dset), np.sum(dset)))
            dset.attrs['start_time'] = stack_start_time.isoformat()
            dset.attrs['dt'] = delta
        f_stack.close()

        # keep the stack_filename
        grid_info['stack_file'] = stack_filename

        if options_time:
            t = time() - t_ref
            logging.info("Time for extracting maxima : %.2f s\n" % t)

    if keep_grid:
        if not use_ram:
            # working on small-ram machine
            sg = stack_grid
        else:
            # working on big-ram machine : create the hdf5 file
            f = h5py.File(grid_filename, 'w')
            sg = f.create_dataset('stack_grid', data=stack_grid)

        # add useful attributes to the hdf5 dataset
        for key, value in grid_info.iteritems():
            sg.attrs[key] = value
        sg.attrs['dt'] = delta
        sg.attrs['start_time'] = stack_start_time.isoformat()

        # keep the grid_filename
        grid_info['dat_file'] = grid_filename

    # hdf5 files cannot have complex attributes, so add these to grid_info
    # after having added the useful info to stack_grid.attrs
    grid_info['grid_spacing'] = (dx, dy, dz, delta)
    grid_info['grid_orig'] = (x_orig, y_orig, z_orig)
    grid_info['grid_shape'] = (nx, ny, nz, nt)
    grid_info['start_time'] = stack_start_time

    # close the hdf5 file for the grid
    if (not use_ram) or keep_grid:
        f.close()

    # remove the grid file unless you want to keep it
    if (not keep_grid) and (not use_ram):
        logging.info('Removing grid file %s' % grid_filename)
        os.remove(grid_filename)
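
# Illustrative sketch (not part of the original module): stack_grid is 2D,
# (n_buf, nt), with the three spatial dimensions flattened into
# n_buf = nx*ny*nz in C order. extract_max_values below depends on exactly
# this layout when it maps flat indexes back to (ix, iy, iz) with
# np.unravel_index.
def _example_grid_layout():
    nx, ny, nz = 4, 3, 2                 # tiny hypothetical search grid
    ib = np.ravel_multi_index((2, 1, 0), (nx, ny, nz))  # flatten (ix, iy, iz)
    ix, iy, iz = np.unravel_index(ib, (nx, ny, nz))     # and invert it
    assert (ix, iy, iz) == (2, 1, 0)
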

def migrate_4D_stack(data, delta, time_grids, stack_grid, use_ram=False):
    """
    Migrate the 4D stack. Central part of the waveloc process.

    TODO : flesh out this doc-string

    :param data:
    :param delta:
    :param time_grids:
    :param stack_grid:
    :param use_ram:
    """
    # save the list of data keys
    # note : keys of data are all included in keys of time_grid, but there may
    # be more times than data
    wf_ids = data.keys()
    n_wf_ids = len(wf_ids)

    n_buf, min_npts = stack_grid.shape
    logging.debug("Stack max dimension = %d x %d" % (n_buf, min_npts))

    # initialize the arrays we will need
    tmp_stack = np.empty(min_npts, dtype='float32')
    tmp_stack[:] = 0.
    i_times = np.zeros((n_wf_ids, n_buf), dtype='int')
    i_max_times = np.zeros(n_buf, dtype='int')
    i_min_times = np.zeros(n_buf, dtype='int')
    start_index = np.zeros(n_buf, dtype='int')
    start_indexes = np.zeros((n_wf_ids, n_buf), dtype='int')
    end_indexes = np.zeros((n_wf_ids, n_buf), dtype='int')
    n_lens = np.zeros((n_wf_ids, n_buf), dtype='int')

    # construct grid (n_buf x n_sta) of time_indexes for migration
    for i in islice(count(0), n_wf_ids):
        wf_id = wf_ids[i]
        i_times[i, :] = np.round(time_grids[wf_id].grid_data[:] / delta)

    # find the min and max time indexes for each point
    i_min_times = np.min(i_times, 0)
    i_max_times = np.max(i_times, 0)
    iextreme_min_time = np.min(i_min_times)
    iextreme_max_time = np.max(i_max_times)
    start_index = i_min_times - iextreme_min_time
    stack_shift_time = delta * iextreme_min_time

    # find start indexes, end indexes and lengths for each station and point
    start_indexes = i_times - i_min_times
    end_indexes = i_times - i_max_times + min_npts
    n_lens = end_indexes - start_indexes
    # keep the shortest length for each point
    n_len = np.min(n_lens, 0)
    # keep the shortest overall length
    shortest_n_len = np.min(n_len)

    # this will fix the length of the stack to the shortest possible length
    # given all the previous travel time information
    norm_stack_len = shortest_n_len - iextreme_max_time
    if norm_stack_len < 0:
        logging.error('Data length too short for coherent migration across \
network')

    # the actual migration loop
    # cannot seem to vectorize this any more... too bad !!
    for ib in islice(count(0), n_buf):
        # this is ugly, but necessary to avoid memory leak from inner loop
        _do_stack(ib, n_wf_ids, wf_ids, stack_grid, data, min_npts, n_lens,
                  start_indexes, end_indexes, start_index, norm_stack_len)

    # clean up what is no longer needed
    del i_times, i_min_times, i_max_times, start_indexes, end_indexes
    del n_lens, start_index

    # resize stack_grid
    if use_ram:
        stack_grid.resize((n_buf, norm_stack_len), refcheck=False)
    else:
        stack_grid.resize(norm_stack_len, axis=1)

    # end
    return stack_shift_time
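
# A toy sketch (not part of the original module) of the shift-and-stack idea
# implemented by migrate_4D_stack: for a single grid point, each trace is
# read starting at that station's travel-time index, so energy radiated at a
# common origin time aligns before summation. All numbers here are made up.
def _example_shift_and_stack():
    npts = 10
    i_times = np.array([2, 5])           # hypothetical travel times (samples)
    data = {}
    for sta, it in zip(('STA1', 'STA2'), i_times):
        tr = np.zeros(npts, dtype='float32')
        tr[it + 3] = 1.0                 # pulse radiated at origin sample 3
        data[sta] = tr
    n_len = npts - np.max(i_times)       # longest common aligned length
    stack = np.zeros(n_len, dtype='float32')
    for sta, it in zip(('STA1', 'STA2'), i_times):
        stack += data[sta][it:it + n_len]
    assert np.argmax(stack) == 3         # both pulses stack at the origin
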

def _do_stack(ib, n_wf_ids, wf_ids, stack_grid, data, min_npts, n_lens,
              start_indexes, end_indexes, start_index, norm_stack_len):

    tmp_stack = np.empty(min_npts, dtype='float32')
    tmp_stack[:] = 0.

    # stack shifted data from each station
    for i in islice(count(0), n_wf_ids):
        tmp_stack[0:n_lens[i, ib]] += \
            data[wf_ids[i]][start_indexes[i, ib]:end_indexes[i, ib]]

    # we need to homogenize, and get everything to start and end at the same
    # time
    stack_grid[ib, 0:norm_stack_len] = \
        tmp_stack[start_index[ib]:start_index[ib] + norm_stack_len]

    # cleanup
    del tmp_stack
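
# Hypothetical usage sketch of _do_stack (not part of the original module),
# for a single grid point (n_buf = 1) and two waveforms; the index arrays
# are fabricated to show the expected (n_wf_ids, n_buf) shapes.
def _example_do_stack():
    wf_ids = ['STA1', 'STA2']
    min_npts = 8
    data = dict((wf, np.arange(min_npts, dtype='float32')) for wf in wf_ids)
    stack_grid = np.zeros((1, min_npts), dtype='float32')
    start_indexes = np.array([[0], [2]])          # per-station read offsets
    end_indexes = np.array([[6], [8]])
    n_lens = end_indexes - start_indexes          # 6 samples from each trace
    start_index = np.array([0])                   # no homogenization shift
    _do_stack(0, 2, wf_ids, stack_grid, data, min_npts, n_lens,
              start_indexes, end_indexes, start_index, norm_stack_len=4)
    assert np.allclose(stack_grid[0, :4], [2., 4., 6., 8.])
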

def extract_max_values(stack_grid, search_info, f_stack, use_ram=False,
                       n_max=5e8):
    """
    Extracts the maximum stack value from a 4D migrated grid. Also extracts
    x_max, y_max, z_max.

    TODO : flesh out this doc-string.

    :param stack_grid:
    :param search_info:
    :param f_stack:
    :param use_ram:
    :param n_max:
    """
    # get basic info
    nx = search_info['nx']
    ny = search_info['ny']
    nz = search_info['nz']
    dx = search_info['dx']
    dy = search_info['dy']
    dz = search_info['dz']
    x_orig = search_info['x_orig']
    y_orig = search_info['y_orig']
    z_orig = search_info['z_orig']

    nb, nt = stack_grid.shape

    if use_ram:
        max_val = np.empty(nt, dtype='float32')
        max_val_smooth = np.empty(nt, dtype='float32')
        max_x = np.empty(nt, dtype='float32')
        max_y = np.empty(nt, dtype='float32')
        max_z = np.empty(nt, dtype='float32')
        # create temporary arrays
        max_ib = np.empty(nt, dtype=int)
        max_ix = np.empty(nt, dtype=int)
        max_iy = np.empty(nt, dtype=int)
        max_iz = np.empty(nt, dtype=int)
    else:
        max_val = f_stack.create_dataset('max_val', (nt, ), 'f')
        max_val_smooth = f_stack.create_dataset('max_val_smooth', (nt, ), 'f')
        max_x = f_stack.create_dataset('max_x', (nt, ), 'f')
        max_y = f_stack.create_dataset('max_y', (nt, ), 'f')
        max_z = f_stack.create_dataset('max_z', (nt, ), 'f')
        # create temporary datasets
        max_ib = f_stack.create_dataset('max_ib', (nt, ), 'i')
        max_ix = f_stack.create_dataset('max_ix', (nt, ), 'i')
        max_iy = f_stack.create_dataset('max_iy', (nt, ), 'i')
        max_iz = f_stack.create_dataset('max_iz', (nt, ), 'i')

    # extract values
    dt = int(n_max / nb)
    if nt <= dt:
        # do the extraction in one step
        max_ib[:] = np.argmax(stack_grid, 0)
        max_val[:] = np.max(stack_grid, 0)
    else:
        # do the extraction in steps
        n = nt / dt
        logging.info('Number of values exceeds %d. Doing extraction in %d \
steps' % (n_max, n))
        for i in islice(count(0), n):
            max_ib[i*dt:(i+1)*dt] = np.argmax(stack_grid[:, i*dt:(i+1)*dt], 0)
            max_val[i*dt:(i+1)*dt] = np.max(stack_grid[:, i*dt:(i+1)*dt], 0)
        max_ib[n*dt:nt] = np.argmax(stack_grid[:, n*dt:nt], 0)
        max_val[n*dt:nt] = np.max(stack_grid[:, n*dt:nt], 0)

    # find the corresponding x,y,z values
    max_ix, max_iy, max_iz = np.unravel_index(max_ib, (nx, ny, nz))
    max_x[:] = max_ix[:] * dx + x_orig
    max_y[:] = max_iy[:] * dy + y_orig
    max_z[:] = max_iz[:] * dz + z_orig
    max_val_smooth[:] = smooth(np.array(max_val), 51)

    if use_ram:
        # add the datasets to the hdf5 file
        max_val = f_stack.create_dataset('max_val', data=max_val)
        max_val_smooth = f_stack.create_dataset('max_val_smooth',
                                                data=max_val_smooth)
        max_x = f_stack.create_dataset('max_x', data=max_x)
        max_y = f_stack.create_dataset('max_y', data=max_y)
        max_z = f_stack.create_dataset('max_z', data=max_z)
    else:
        # clean up temporary datasets
        del f_stack['max_ib']
        del f_stack['max_ix']
        del f_stack['max_iy']
        del f_stack['max_iz']
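
# Illustrative sketch (not part of the original module) of reading back the
# datasets written by extract_max_values; the file name is a placeholder,
# while the dataset names and attributes are those created above.
def _example_read_stack_file(stack_filename):
    f = h5py.File(stack_filename, 'r')
    max_val = f['max_val'][:]                 # stack maximum per time step
    max_x = f['max_x'][:]                     # x of that maximum
    start_time = f['max_val'].attrs['start_time']
    dt = f['max_val'].attrs['dt']
    f.close()
    return start_time, dt, max_val, max_x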