LALApps 10.1.0.1-eeff03c
lalapps_string_final.py
1##python
2#
3# Copyright (C) 2006--2010 Kipp Cannon
4#
5# This program is free software; you can redistribute it and/or modify it
6# under the terms of the GNU General Public License as published by the
7# Free Software Foundation; either version 2 of the License, or (at your
8# option) any later version.
9#
10# This program is distributed in the hope that it will be useful, but
11# WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
13# Public License for more details.
14#
15# You should have received a copy of the GNU General Public License along
16# with this program; if not, write to the Free Software Foundation, Inc.,
17# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18
19
20#
21# =============================================================================
22#
23# Preamble
24#
25# =============================================================================
26#
27
28
29"""
30String cusp search final output rendering tool.
31"""
32
33
34from __future__ import print_function
35
36
37import bisect
38import copyreg
39import pickle
40import heapq
41import itertools
42from optparse import OptionParser
43import math
44from matplotlib import patches
45import numpy
46import os
47import random
48import select
49from scipy import interpolate
50from scipy import optimize
51import sqlite3
52import sys
53import traceback
54
55
56from igwn_ligolw import dbtables
57from igwn_ligolw.utils import process as ligolwprocess
58import lal
59from lal import rate
60from lal.utils import CacheEntry
61from lalburst import git_version
62from lalburst import packing
63from lalburst import SimBurstUtils
64from lalburst import SnglBurstUtils
65from lalburst import stringutils
66import igwn_segments as segments
67from igwn_segments import utils as segmentsUtils
68
69
70SnglBurstUtils.matplotlib.rcParams.update({
71 "font.size": 10.0,
72 "text.usetex": True,
73 "axes.titlesize": 10.0,
74 "axes.labelsize": 10.0,
75 "xtick.labelsize": 8.0,
76 "ytick.labelsize": 8.0,
77 "legend.fontsize": 8.0,
78 "grid.linestyle": "-"
79})
80
81
82__author__ = "Kipp Cannon <kipp.cannon@ligo.org>"
83__version__ = "git id %s" % git_version.id
84__date__ = git_version.date
85
86
87copyreg.pickle(lal.LIGOTimeGPS, lambda gps: (lal.LIGOTimeGPS, (gps.gpsSeconds, gps.gpsNanoSeconds)))
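# this registration lets lal.LIGOTimeGPS values survive the pickle round
# trips used below (worker processes pipe their results back as pickles,
# and the --import-dump files are pickles too); for example, roughly:
#	pickle.loads(pickle.dumps(lal.LIGOTimeGPS(1000000000, 5))) == lal.LIGOTimeGPS(1000000000, 5)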
88
89
90#
91# =============================================================================
92#
93# Command Line
94#
95# =============================================================================
96#
97
98
99def parse_command_line():
100 parser = OptionParser(
101 version = "Name: %%prog\n%s" % git_version.verbose_msg,
102 usage = "%prog [options] [file ...]",
103 description = "%prog performs the final, summary, stages of the upper-limit string cusp search. Input consists of a list of all sqlite format database files produced by all injection and non-injection runs of the analysis pipeline. The file names can be given on the command line and/or provided in a LAL cache file."
104 )
105 parser.add_option("--cal-uncertainty", metavar = "fraction", type = "float", help = "Set the fractional uncertainty in amplitude due to calibration uncertainty (e.g. 0.08). This option is required; use 0 to disable calibration uncertainty.")
106 parser.add_option("--injections-bin-size", metavar = "bins", type = "float", default = 16.7, help = "Set bin width for injection efficiency curves (default = 16.7).")
107 parser.add_option("-c", "--input-cache", metavar = "filename", action = "append", help = "Also process the files named in this LAL cache. See lalapps_path2cache for information on how to produce a LAL cache file. This option can be given multiple times.")
108 parser.add_option("--import-dump", metavar = "filename", action = "append", help = "Import additional rate vs. threshold or efficiency data from this dump file. Dump files are one of the data products produced by this program. Whether the file provides rate vs. threshold data or efficiency data will be determined automatically. This option can be given multiple times.")
109 parser.add_option("--image-formats", metavar = "ext[,ext,...]", default = "png,pdf", help = "Set list of graphics formats to produce by providing a comma-delimited list of the filename extensions (default = \"png,pdf\").")
110 parser.add_option("-p", "--live-time-program", metavar = "program", default = "StringSearch", help = "Set the name, as it appears in the process table, of the program whose search summary entries define the search live time (default = StringSearch).")
111 parser.add_option("-o", "--open-box", action = "store_true", help = "Perform open-box analysis. In a closed-box analysis (the default), information about the events seen at zero-lag is concealed: the rate vs. threshold plot only shows the rate of events seen in the background, the detection threshold used to measure the efficiency curves is obtained from n-th loudest background event where n is (the integer closest to) the ratio of background livetime to zero-lag livetime, and messages to stdout and stderr that contain information about event counts at zero-lag are silenced.")
112 parser.add_option("-t", "--tmp-space", metavar = "path", help = "Path to a directory suitable for use as a work area while manipulating the database file. The database file will be worked on in this directory, and then moved to the final location when complete. This option is intended to improve performance when running in a networked environment, where there might be a local disk with higher bandwidth than is available to the filesystem on which the final output will reside.")
113 parser.add_option("--vetoes-name", metavar = "name", help = "Set the name of the segment lists to use as vetoes (default = do not apply vetoes).")
114 parser.add_option("--detection-threshold", metavar = "log likelihood ratio", type = "float", help = "Override the detection threshold. Only injection files will be processed, and the efficiency curve measured.")
115 parser.add_option("--record-background", metavar = "N", type = "int", default = 10000000, help = "Set the number of background log likelihood ratios to hold in memory for producing the rate vs. threshold plot (default = 10000000).")
116 parser.add_option("--record-candidates", metavar = "N", type = "int", default = 100, help = "Set the number of highest-ranked zero-lag candidates to dump to the candidate file (default = 100).")
117 parser.add_option("--threads", metavar = "N", type = "int", default = 1, help = "Set the maximum number of parallel threads to use for processing files (default = 1). Contention for the global Python interpreter lock will throttle the true number that can run. The number of threads will be automatically adjusted downwards if the number requested exceeds the number of input files.")
118 parser.add_option("-v", "--verbose", action = "store_true", help = "Be verbose.")
119 options, filenames = parser.parse_args()
120
121 if options.cal_uncertainty is None:
122 raise ValueError("must set --cal-uncertainty (use 0 to ignore calibration uncertainty)")
123
124 options.image_formats = options.image_formats.split(",")
125
126 if options.input_cache:
127 filenames += [CacheEntry(line).path for input_cache in options.input_cache for line in open(input_cache)]
128
129 if options.threads < 1:
130 raise ValueError("--threads must be >= 1")
131
132 return options, filenames
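	# a typical closed-box invocation might look something like this
	# (executable and cache file names here are hypothetical):
	#
	#	lalapps_string_final --cal-uncertainty 0.08 --vetoes-name vetoes \
	#		--input-cache noninjections.cache --input-cache injections.cache \
	#		--threads 4 --verbose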
133
134
135#
136# =============================================================================
137#
138# Rate vs. Threshold
139#
140# =============================================================================
141#
142
143
144def ratevsthresh_bounds(background_x, background_y, zero_lag_x, zero_lag_y):
145 if len(zero_lag_x):
146 if len(background_x):
147 minX, maxX = min(min(zero_lag_x), min(background_x)), max(max(zero_lag_x), max(background_x))
148 minY, maxY = min(min(zero_lag_y), min(background_y)), max(max(zero_lag_y), max(background_y))
149 else:
150 minX, maxX = min(zero_lag_x), max(zero_lag_x)
151 minY, maxY = min(zero_lag_y), max(zero_lag_y)
152 else:
153 # don't check for background, if there's no zero-lag and no
154 # background we're screwed anyway
155 minX, maxX = min(background_x), max(background_x)
156 minY, maxY = min(background_y), max(background_y)
157
158 # override the plot's X axis lower bound
159 minX = 1e-2 # FIXME: don't hard-code
160 if len(zero_lag_x):
161 if len(background_x):
162 maxY = max(zero_lag_y[bisect.bisect_left(zero_lag_x, minX)], background_y[bisect.bisect_left(background_x, minX)])
163 else:
164 maxY = zero_lag_y[bisect.bisect_left(zero_lag_x, minX)]
165 else:
166 maxY = background_y[bisect.bisect_left(background_x, minX)]
167
168 return minX, maxX, minY, maxY
169
170
171def expected_count(x, background_x, background_y):
172 if x > background_x[-1]:
173 return 0
174 return background_y[bisect.bisect_left(background_x, x)]
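	# for example (toy numbers): with background_x = [1., 2., 3.] sorted by
	# ranking statistic and background_y = [3., 2., 1.] giving the (scaled)
	# count of events at or above each threshold, expected_count(2.5,
	# background_x, background_y) returns 1., while expected_count(4., ...)
	# returns 0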
175
176
177def compress_ratevsthresh_curve(x, y, yerr):
178 #
179 # construct a background mask to retain the highest-ranked 10,000
180 # elements, then every 10th until the 100,000th highest-ranked
181 # element, then every 100th after that. this is for reducing the
182 # dataset size so matplotlib can handle it and vector graphics
183 # output isn't ridiculous in size.
184 #
185
186 mask = numpy.arange(len(x))[::-1]
187 mask = (mask < 10000) | ((mask < 100000) & (mask % 10 == 0)) | (mask % 100 == 0)
188
189 return x.compress(mask), y.compress(mask), yerr.compress(mask)
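	# as a rough sanity check of the decimation above (toy size): for
	# 10**6 background events,
	#	mask = numpy.arange(10**6)[::-1]
	#	mask = (mask < 10000) | ((mask < 100000) & (mask % 10 == 0)) | (mask % 100 == 0)
	# keeps mask.sum() == 10000 + 9000 + 9000 == 28000 points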
190
191
192class RateVsThreshold(object):
193 def __init__(self, open_box, record_background, record_candidates):
194 self.zero_lag = []
195 self.background = []
196 self.n_background = 0
197 self.record_background = record_background
198 self.most_significant_background = []
199 self.candidates = []
200 self.record_candidates = record_candidates
201 self.zero_lag_time = 0.0
202 self.background_time = 0.0
203 self.open_box = open_box
204
205 def __iadd__(self, other):
206 self.zero_lag += other.zero_lag
207 self.n_background += other.n_background
208 self.background[:] = heapq.nlargest(self.record_background, itertools.chain(self.background, other.background))
209 self.most_significant_background[:] = heapq.nlargest(self.record_candidates, itertools.chain(self.most_significant_background, other.most_significant_background))
210 self.candidates[:] = heapq.nlargest(self.record_candidates, itertools.chain(self.candidates, other.candidates))
211 self.zero_lag_time += other.zero_lag_time
212 self.background_time += other.background_time
213 return self
214
215 def add_contents(self, contents, verbose = False):
216 #
217 # retrieve the offset vectors, retain only instruments that
218 # are available
219 #
220
221 zero_lag_time_slides, background_time_slides = SnglBurstUtils.get_time_slides(contents.connection)
222 assert len(zero_lag_time_slides) == 1
223
224 #
225 # compute the live time
226 #
227
228 seglists = contents.seglists - contents.vetoseglists
229 self.zero_lag_time += stringutils.time_slides_livetime(seglists, zero_lag_time_slides.values(), 2, clip = contents.coincidence_segments)
230 self.background_time += stringutils.time_slides_livetime(seglists, background_time_slides.values(), 2, clip = contents.coincidence_segments)
231 if set(("H1", "H2")).issubset(set(contents.seglists)):
232 # we have segments for both H1 and H2, remove time
233 # when exactly that pair are on
234 self.zero_lag_time -= stringutils.time_slides_livetime_for_instrument_combo(seglists, zero_lag_time_slides.values(), ("H1", "H2"), clip = contents.coincidence_segments)
235 self.background_time -= stringutils.time_slides_livetime_for_instrument_combo(seglists, background_time_slides.values(), ("H1", "H2"), clip = contents.coincidence_segments)
236
237 #
238 # count events
239 #
240
241 for ln_likelihood_ratio, instruments, coinc_event_id, peak_time, is_background in contents.connection.cursor().execute("""
242SELECT
243 coinc_event.likelihood,
244 coinc_event.instruments,
245 coinc_event.coinc_event_id,
246 (
247 SELECT
248 AVG(sngl_burst.peak_time) + 1e-9 * AVG(sngl_burst.peak_time_ns)
249 FROM
250 sngl_burst
251 JOIN coinc_event_map ON (
252 coinc_event_map.coinc_event_id == coinc_event.coinc_event_id
253 AND coinc_event_map.table_name == 'sngl_burst'
254 AND coinc_event_map.event_id == sngl_burst.event_id
255 )
256 ),
257 EXISTS (
258 SELECT
259 *
260 FROM
261 time_slide
262 WHERE
263 time_slide.time_slide_id == coinc_event.time_slide_id
264 AND time_slide.offset != 0
265 )
266FROM
267 coinc_event
268WHERE
269 coinc_event.coinc_def_id == ?
270 """, (contents.bb_definer_id,)):
271 # likelihood ratio must be listed first to
272 # act as the sort key
273 record = (ln_likelihood_ratio, contents.filename, coinc_event_id, dbtables.lsctables.instrumentsproperty.get(instruments), peak_time)
274 if ln_likelihood_ratio is None:
275 # coinc got vetoed (unable to compute
276 # likelihood)
277 pass
278 elif is_background:
279 # non-vetoed background
280 self.n_background += 1
281 if len(self.background) < self.record_background:
282 heapq.heappush(self.background, ln_likelihood_ratio)
283 else:
284 heapq.heappushpop(self.background, ln_likelihood_ratio)
285 if len(self.most_significant_background) < self.record_candidates:
286 heapq.heappush(self.most_significant_background, record)
287 else:
288 heapq.heappushpop(self.most_significant_background, record)
289 else:
290 # non-vetoed zero lag
291 self.zero_lag.append(ln_likelihood_ratio)
292 if len(self.candidates) < self.record_candidates:
293 heapq.heappush(self.candidates, record)
294 else:
295 heapq.heappushpop(self.candidates, record)
296
297 def finish(self):
298 #
299 # dump info about highest-ranked zero-lag and background
300 # events
301 #
302
303 self.most_significant_background.sort()
304 self.candidates.sort()
305
306 f = open("string_most_significant_background.txt", "w")
307 print("Highest-Ranked Background Events", file=f)
308 print("================================", file=f)
309 for ln_likelihood_ratio, filename, coinc_event_id, instruments, peak_time in self.most_significant_background:
310 print(file=f)
311 print("%s in %s:" % (str(coinc_event_id), filename), file=f)
312 print("Recovered in: %s" % ", ".join(sorted(instruments or [])), file=f)
313 print("Mean peak time: %.16g s GPS" % peak_time, file=f)
314 print("\\log \\Lambda: %.16g" % ln_likelihood_ratio, file=f)
315
316 f = open("string_candidates.txt", "w")
317 print("Highest-Ranked Zero-Lag Events", file=f)
318 print("==============================", file=f)
319 if self.open_box:
320 for ln_likelihood_ratio, filename, coinc_event_id, instruments, peak_time in self.candidates:
321 print(file=f)
322 print("%s in %s:" % (str(coinc_event_id), filename), file=f)
323 print("Recovered in: %s" % ", ".join(sorted(instruments or [])), file=f)
324 print("Mean peak time: %.16g s GPS" % peak_time, file=f)
325 print("\\log \\Lambda: %.16g" % ln_likelihood_ratio, file=f)
326 else:
327 print(file=f)
328 print("List suppressed: box is closed", file=f)
329
330 #
331 # sort the ranking statistics and convert to arrays.
332 #
333
334 self.background.sort()
335 self.zero_lag.sort()
336 self.zero_lag = numpy.array(self.zero_lag, dtype = "double")
337 background_x = numpy.array(self.background, dtype = "double")
338
339 #
340 # convert counts to rates and their uncertainties
341 #
342
343 # background count expected in zero-lag and \sqrt{N}
344 # standard deviation
345 background_y = numpy.arange(len(background_x), 0.0, -1.0, dtype = "double") / self.background_time * self.zero_lag_time
346 background_yerr = numpy.sqrt(background_y)
347
348 # count observed in zero-lag
349 zero_lag_y = numpy.arange(len(self.zero_lag), 0.0, -1.0, dtype = "double")
350
351 # compute zero-lag - expected residual in units of \sqrt{N}
352 zero_lag_residual = numpy.zeros((len(self.zero_lag),), dtype = "double")
353 for i in range(len(self.zero_lag)):
354 n = expected_count(self.zero_lag[i], background_x, background_y)
355 zero_lag_residual[i] = (zero_lag_y[i] - n) / math.sqrt(n)
356
357 # convert counts to rates
358 background_y /= self.zero_lag_time
359 background_yerr /= self.zero_lag_time
360 zero_lag_y /= self.zero_lag_time
361
362 #
363 # determine the horizontal and vertical extent of the plot
364 #
365
366 if self.open_box:
367 minX, maxX, minY, maxY = ratevsthresh_bounds(background_x, background_y, self.zero_lag, zero_lag_y)
368 else:
369 minX, maxX, minY, maxY = ratevsthresh_bounds(background_x, background_y, [], [])
370
371 #
372 # compress the background data
373 #
374
375 background_x, background_y, background_yerr = compress_ratevsthresh_curve(background_x, background_y, background_yerr)
376
377 #
378 # save data points in a text file
379 #
380
381 numpy.savetxt("string_rate_background.txt", numpy.transpose((background_x, background_y, background_yerr)))
382
383 #
384 # start the rate vs. threshold plot
385 #
386
387 ratefig, axes = SnglBurstUtils.make_burst_plot(r"Ranking Statistic Threshold, $\log \Lambda$", "Event Rate (Hz)", width = 108.0)
388 axes.semilogy()
389 axes.set_position([0.125, 0.15, 0.83, 0.75])
390 axes.xaxis.grid(True, which = "major,minor")
391 axes.yaxis.grid(True, which = "major,minor")
392 if self.open_box:
393 axes.set_title(r"Event Rate vs.\ Ranking Statistic Threshold")
394 else:
395 axes.set_title(r"Event Rate vs.\ Ranking Statistic Threshold (Closed Box)")
396
397 # warning: the error bar polygon is not *really* clipped
398 # to the axes' bounding box, the result will be incorrect
399 # if the density of data points is small where the polygon
400 # encounters the axes' bounding box.
401
402 poly_x = numpy.concatenate((background_x, background_x[::-1]))
403 poly_y = numpy.concatenate((background_y + 1 * background_yerr, (background_y - 1 * background_yerr)[::-1]))
404 axes.add_patch(patches.Polygon(list(zip(poly_x, numpy.clip(poly_y, minY, maxY))), edgecolor = "k", facecolor = "k", alpha = 0.3))
405 line1, = axes.semilogy(background_x.repeat(2)[:-1], background_y.repeat(2)[1:], color = "k", linestyle = "--")
406 if self.open_box:
407 line2, = axes.semilogy(self.zero_lag.repeat(2)[:-1], zero_lag_y.repeat(2)[1:], color = "k", linestyle = "-", linewidth = 2)
408 axes.legend((line1, line2), (r"Expected background", r"Zero-lag"), loc = "lower left")
409 else:
410 axes.legend((line1,), (r"Expected background",), loc = "lower left")
411
412 axes.set_xlim((minX, maxX))
413 axes.set_ylim((minY, maxY))
414
415 #
416 # start the count residual vs. threshold plot
417 #
418
419 residualfig, axes = SnglBurstUtils.make_burst_plot(r"Ranking Statistic Threshold, $\log \Lambda$", r"Event Count Residual / $\sqrt{N}$", width = 108.0)
420 axes.set_position([0.125, 0.15, 0.83, 0.75])
421 axes.xaxis.grid(True, which = "major,minor")
422 axes.yaxis.grid(True, which = "major,minor")
423 if self.open_box:
424 axes.set_title(r"Event Count Residual vs.\ Ranking Statistic Threshold")
425 else:
426 axes.set_title(r"Event Count Residual vs.\ Ranking Statistic Threshold (Closed Box)")
427
428 axes.add_patch(patches.Polygon(((minX, -1), (maxX, -1), (maxX, +1), (minX, +1)), edgecolor = "k", facecolor = "k", alpha = 0.3))
429 if self.open_box:
430 line1, = axes.plot(self.zero_lag, zero_lag_residual, color = "k", linestyle = "-", linewidth = 2)
431
432 axes.set_xlim((minX, maxX))
433
434 #
435 # done
436 #
437
438 return ratefig, residualfig
439
440
441#
442# =============================================================================
443#
444# Efficiency
445#
446# =============================================================================
447#
448
449
450def slope(x, y):
451 """
452 From the x and y arrays, compute the slope at the x co-ordinates
453 using a first-order finite difference approximation.
454 """
455 slope = numpy.zeros((len(x),), dtype = "double")
456 slope[0] = (y[1] - y[0]) / (x[1] - x[0])
457 for i in range(1, len(x) - 1):
458 slope[i] = (y[i + 1] - y[i - 1]) / (x[i + 1] - x[i - 1])
459 slope[-1] = (y[-1] - y[-2]) / (x[-1] - x[-2])
460 return slope
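	# for example (toy numbers): slope(numpy.array([0., 1., 2.]),
	# numpy.array([0., 1., 4.])) gives [1., 2., 3.]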
461
462
463def upper_err(y, yerr, deltax):
464 z = y + yerr
465 deltax = int(deltax)
466 upper = numpy.zeros((len(yerr),), dtype = "double")
467 for i in range(len(yerr)):
468 upper[i] = max(z[max(i - deltax, 0) : min(i + deltax, len(z))])
469 return upper - y
470
471
472def lower_err(y, yerr, deltax):
473 z = y - yerr
474 deltax = int(deltax)
475 lower = numpy.zeros((len(yerr),), dtype = "double")
476 for i in range(len(yerr)):
477 lower[i] = min(z[max(i - deltax, 0) : min(i + deltax, len(z))])
478 return y - lower
479
480
481def write_efficiency(fileobj, bins, eff, yerr):
482 print("# A e D[e]", file=fileobj)
483 for A, e, De in zip(bins.centres()[0], eff, yerr):
484 print("%.16g %.16g %.16g" % (A, e, De), file=fileobj)
485
486
487def render_data_from_bins(dump_file, axes, efficiency_num, efficiency_den, cal_uncertainty, filter_width, colour = "k", erroralpha = 0.3, linestyle = "-"):
488 # extract array of x co-ordinates, and the factor by which x
489 # increases from one sample to the next.
490 (x,) = efficiency_den.centres()
491 x_factor_per_sample = efficiency_den.bins[0].delta
492
493 # compute the efficiency, the slope (units = efficiency per
494 # sample), the y uncertainty (units = efficiency) due to binomial
495 # counting fluctuations, and the x uncertainty (units = samples)
496 # due to the width of the smoothing filter.
497 eff = efficiency_num.array / efficiency_den.array
498 dydx = slope(numpy.arange(len(x), dtype = "double"), eff)
499 yerr = numpy.sqrt(eff * (1. - eff) / efficiency_den.array)
500 xerr = numpy.array([filter_width / 2.] * len(yerr))
501
502 # compute the net y err (units = efficiency) by (i) multiplying the
503 # x err by the slope, (ii) dividing the calibration uncertainty
504 # (units = percent) by the fractional change in x per sample and
505 # multiplying by the slope, (iii) adding the two in quadrature with
506 # the y err.
507 net_yerr = numpy.sqrt((xerr * dydx)**2. + yerr**2. + (cal_uncertainty / x_factor_per_sample * dydx)**2.)
508
509 # compute net xerr (units = percent) by dividing yerr by slope and
510 # then multiplying by the fractional change in x per sample.
511 net_xerr = net_yerr / dydx * x_factor_per_sample
512
513 # write the efficiency data to file
514 write_efficiency(dump_file, efficiency_den.bins, eff, net_yerr)
515
516 # plot the efficiency curve and uncertainty region
517 patch = patches.Polygon(list(zip(numpy.concatenate((x, x[::-1])), numpy.concatenate((eff + upper_err(eff, yerr, filter_width / 2.), (eff - lower_err(eff, yerr, filter_width / 2.))[::-1])))), edgecolor = colour, facecolor = colour, alpha = erroralpha)
518 axes.add_patch(patch)
519 line, = axes.plot(x, eff, colour + linestyle)
520
521 # compute 50% point and its uncertainty
522 A50 = optimize.bisect(interpolate.interp1d(x, eff - 0.5), x[0], x[-1], xtol = 1e-40)
523 A50_err = interpolate.interp1d(x, net_xerr)(A50)
524
525 # mark 50% point on graph
526 #axes.axvline(A50, color = colour, linestyle = linestyle)
527
528 # print some analysis FIXME: this calculation needs attention
529 num_injections = efficiency_den.array.sum()
530 num_samples = len(efficiency_den.array)
531 print("Bins were %g samples wide, ideal would have been %g" % (filter_width, (num_samples / num_injections / interpolate.interp1d(x, dydx)(A50)**2.0)**(1.0/3.0)), file=sys.stderr)
532 print("Average number of injections in each bin = %g" % efficiency_den.array.mean(), file=sys.stderr)
533
534 return line, A50, A50_err
535
536
537class Efficiency(object):
538 def __init__(self, detection_threshold, cal_uncertainty, filter_width, open_box):
539 self.detection_threshold = detection_threshold
540 self.cal_uncertainty = cal_uncertainty
541 self.filter_width = filter_width
542 self.seglists = segments.segmentlistdict()
543 self.vetoseglists = segments.segmentlistdict()
544 self.found = []
545 self.n_diagnostics = 100 # keep 100 loudest missed and quietest found injections
546 self.loudest_missed = []
547 self.quietest_found = []
548 self.all = []
549 self.open_box = open_box
550
551 def __iadd__(self, other):
552 assert other.detection_threshold == self.detection_threshold
553 assert other.open_box == self.open_box
554 self.seglists |= other.seglists
555 self.vetoseglists |= other.vetoseglists
556 self.found += other.found
557 self.loudest_missed[:] = heapq.nlargest(self.n_diagnostics, itertools.chain(self.loudest_missed, other.loudest_missed))
558 self.quietest_found[:] = heapq.nlargest(self.n_diagnostics, itertools.chain(self.quietest_found, other.quietest_found))
559 self.all += other.all
560 return self
561
562 def add_contents(self, contents, verbose = False):
563 #
564 # update segment information
565 #
566
567 self.seglists |= contents.seglists
568 self.vetoseglists |= contents.vetoseglists
569 seglists = contents.seglists - contents.vetoseglists
570 if set(("H1", "H2")).issubset(set(seglists)):
571 # we have segments for both H1 and H2, remove time
572 # when exactly that pair are on
573 seglists -= segments.segmentlistdict.fromkeys(seglists, seglists.intersection(("H1", "H2")) - seglists.union(set(seglists) - set(("H1", "H2"))))
574
575 # for each injection, retrieve the highest likelihood ratio
576 # of the burst coincs that were found to match the
577 # injection or null if no burst coincs matched the
578 # injection
579 offsetvectors = contents.time_slide_table.as_dict()
580 stringutils.create_recovered_ln_likelihood_ratio_table(contents.connection, contents.bb_definer_id)
581 for values in contents.connection.cursor().execute("""
582SELECT
583 sim_burst.*,
584 recovered_ln_likelihood_ratio.ln_likelihood_ratio
585FROM
586 sim_burst
587 LEFT JOIN recovered_ln_likelihood_ratio ON (
588 recovered_ln_likelihood_ratio.simulation_id == sim_burst.simulation_id
589 )
590 """):
591 sim = contents.sim_burst_table.row_from_cols(values[:-1])
592 ln_likelihood_ratio = values[-1]
593 found = ln_likelihood_ratio is not None
594 # were at least 2 instruments on when the injection
595 # was made?
596 if len(SimBurstUtils.on_instruments(sim, seglists, offsetvectors[sim.time_slide_id])) >= 2:
597 # yes
598 self.all.append(sim)
599 if found and ln_likelihood_ratio > self.detection_threshold:
600 self.found.append(sim)
601 # 1/amplitude needs to be first so
602 # that it acts as the sort key
603 record = (1.0 / sim.amplitude, sim, offsetvectors[sim.time_slide_id], contents.filename, ln_likelihood_ratio)
604 if len(self.quietest_found) < self.n_diagnostics:
605 heapq.heappush(self.quietest_found, record)
606 else:
607 heapq.heappushpop(self.quietest_found, record)
608 else:
609 # amplitude needs to be first so
610 # that it acts as the sort key
611 record = (sim.amplitude, sim, offsetvectors[sim.time_slide_id], contents.filename, ln_likelihood_ratio)
612 if len(self.loudest_missed) < self.n_diagnostics:
613 heapq.heappush(self.loudest_missed, record)
614 else:
615 heapq.heappushpop(self.loudest_missed, record)
616 elif found:
617 # no, but it was found anyway
618 print("%s: odd, injection %s was found but not injected ..." % (contents.filename, sim.simulation_id), file=sys.stderr)
619
620 def finish(self):
621 fig, axes = SnglBurstUtils.make_burst_plot(r"Injection Amplitude (\(\mathrm{s}^{-\frac{1}{3}}\))", "Detection Efficiency", width = 108.0)
622 axes.set_title(r"Detection Efficiency vs.\ Amplitude")
623 axes.semilogx()
624 axes.set_position([0.10, 0.150, 0.86, 0.77])
625
626 # set desired yticks
627 axes.set_yticks((0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0))
628 axes.set_yticklabels((r"\(0\)", r"\(0.1\)", r"\(0.2\)", r"\(0.3\)", r"\(0.4\)", r"\(0.5\)", r"\(0.6\)", r"\(0.7\)", r"\(0.8\)", r"\(0.9\)", r"\(1.0\)"))
629 axes.xaxis.grid(True, which = "major,minor")
630 axes.yaxis.grid(True, which = "major,minor")
631
632 # put made and found injections in the denominators and
633 # numerators of the efficiency bins
634 bins = rate.NDBins((rate.LogarithmicBins(min(sim.amplitude for sim in self.all), max(sim.amplitude for sim in self.all), 400),))
635 efficiency_num = rate.BinnedArray(bins)
636 efficiency_den = rate.BinnedArray(bins)
637 for sim in self.found:
638 efficiency_num[sim.amplitude,] += 1
639 for sim in self.all:
640 efficiency_den[sim.amplitude,] += 1
641
642 # generate and plot trend curves. adjust window function
643 # normalization so that denominator array correctly
644 # represents the number of injections contributing to each
645 # bin: make w(0) = 1.0. note that this factor has no
646 # effect on the efficiency because it is common to the
647 # numerator and denominator arrays. we do this for the
648 # purpose of computing the Poisson error bars, which
649 # requires us to know the counts for the bins
650 windowfunc = rate.gaussian_window(self.filter_width)
651 windowfunc /= windowfunc[len(windowfunc) // 2 + 1]
652 rate.filter_array(efficiency_num.array, windowfunc)
653 rate.filter_array(efficiency_den.array, windowfunc)
654
655 # regularize: adjust unused bins so that the efficiency is
656 # 0, not NaN
657 assert (efficiency_num.array <= efficiency_den.array).all()
658 efficiency_den.array[(efficiency_num.array == 0) & (efficiency_den.array == 0)] = 1
659
660 line1, A50, A50_err = render_data_from_bins(open("string_efficiency.dat", "w"), axes, efficiency_num, efficiency_den, self.cal_uncertainty, self.filter_width, colour = "k", linestyle = "-", erroralpha = 0.2)
661 print("Pipeline's 50%% efficiency point for all detections = %g +/- %g%%\n" % (A50, A50_err * 100), file=sys.stderr)
662
663 # add a legend to the axes
664 axes.legend((line1,), (r"\noindent Injections recovered with $\log \Lambda > %.2f$" % self.detection_threshold,), loc = "lower right")
665
666 # adjust limits
667 axes.set_xlim([3e-22, 3e-19])
668 axes.set_ylim([0.0, 1.0])
669
670 #
671 # dump some information about the highest-amplitude missed
672 # and quietest-amplitude found injections
673 #
674
675 self.loudest_missed.sort(reverse = True)
676 self.quietest_found.sort(reverse = True)
677
678 f = open("string_loud_missed_injections.txt", "w")
679 print("Highest Amplitude Missed Injections", file=f)
680 print("===================================", file=f)
681 for amplitude, sim, offsetvector, filename, ln_likelihood_ratio in self.loudest_missed:
682 print(file=f)
683 print("%s in %s:" % (str(sim.simulation_id), filename), file=f)
684 if ln_likelihood_ratio is None:
685 print("Not recovered", file=f)
686 else:
687 print("Recovered with \\log \\Lambda = %.16g, detection threshold was %.16g" % (ln_likelihood_ratio, self.detection_threshold), file=f)
688 for instrument in self.seglists:
689 print("In %s:" % instrument, file=f)
690 print("\tInjected amplitude:\t%.16g" % SimBurstUtils.string_amplitude_in_instrument(sim, instrument, offsetvector), file=f)
691 print("\tTime of injection:\t%s s" % sim.time_at_instrument(instrument, offsetvector), file=f)
692 print("Amplitude in waveframe:\t%.16g" % sim.amplitude, file=f)
693 t = sim.get_time_geocent()
694 print("Time at geocentre:\t%s s" % t, file=f)
695 print("Segments within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.seglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.seglists)), file=f)
696 print("Vetoes within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.vetoseglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.vetoseglists)), file=f)
697
698 f = open("string_quiet_found_injections.txt", "w")
699 print("Lowest Amplitude Found Injections", file=f)
700 print("=================================", file=f)
701 for inv_amplitude, sim, offsetvector, filename, ln_likelihood_ratio in self.quietest_found:
702 print(file=f)
703 print("%s in %s:" % (str(sim.simulation_id), filename), file=f)
704 if ln_likelihood_ratio is None:
705 print("Not recovered", file=f)
706 else:
707 print("Recovered with \\log \\Lambda = %.16g, detection threshold was %.16g" % (ln_likelihood_ratio, self.detection_threshold), file=f)
708 for instrument in self.seglists:
709 print("In %s:" % instrument, file=f)
710 print("\tInjected amplitude:\t%.16g" % SimBurstUtils.string_amplitude_in_instrument(sim, instrument, offsetvector), file=f)
711 print("\tTime of injection:\t%s s" % sim.time_at_instrument(instrument, offsetvector), file=f)
712 print("Amplitude in waveframe:\t%.16g" % sim.amplitude, file=f)
713 t = sim.get_time_geocent()
714 print("Time at geocentre:\t%s s" % t, file=f)
715 print("Segments within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.seglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.seglists)), file=f)
716 print("Vetoes within 60 seconds:\t%s" % segmentsUtils.segmentlistdict_to_short_string(self.vetoseglists & segments.segmentlistdict((instrument, segments.segmentlist([segments.segment(t-offsetvector[instrument]-60, t-offsetvector[instrument]+60)])) for instrument in self.vetoseglists)), file=f)
717
718 #
719 # done
720 #
721
722 return fig,
723
724
725#
726# =============================================================================
727#
728# Main
729#
730# =============================================================================
731#
732
733
734def group_files(filenames, verbose = False):
735 # figure out which files are injection runs and which aren't
736 injection_filenames = []
737 noninjection_filenames = []
738 for n, filename in enumerate(filenames):
739 if verbose:
740 print("%d/%d: %s" % (n + 1, len(filenames), filename), file=sys.stderr)
741 connection = sqlite3.connect(filename)
742 if "sim_burst" in dbtables.get_table_names(connection):
743 if verbose:
744 print("\t--> injections", file=sys.stderr)
745 injection_filenames.append(filename)
746 else:
747 if verbose:
748 print("\t--> non-injections", file=sys.stderr)
749 noninjection_filenames.append(filename)
750 connection.close()
751 return injection_filenames, noninjection_filenames
752
753
754def pack_files(filenames, n_threads, verbose = False):
755 bins = packing.BiggestIntoEmptiest([packing.Bin() for n in range(n_threads)])
756 bins.packlist((os.stat(filename).st_size, filename) for filename in filenames)
757 return [bin.objects for bin in bins.bins]
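	# for example (hypothetical file names): pack_files(["a.sqlite",
	# "b.sqlite", "c.sqlite"], 2) returns two lists of file names whose
	# total on-disk sizes are roughly balanced, e.g. [["b.sqlite"],
	# ["a.sqlite", "c.sqlite"]] if b.sqlite is the largest file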
758
759
760def import_dumps(filenames, verbose = False):
761 rate_vs_threshold = None
762 efficiency = None
763 for filename in filenames:
764 if verbose:
765 print("\t%s ..." % filename, end=' ', file=sys.stderr)
766 dump = pickle.load(open(filename, "rb"))
767 if type(dump) is RateVsThreshold:
768 if verbose:
769 print("found rate vs. threshold data", file=sys.stderr)
770 if rate_vs_threshold is None:
771 rate_vs_threshold = dump
772 else:
773 rate_vs_threshold += dump
774 elif type(dump) is Efficiency:
775 if verbose:
776 print("found efficiency data", file=sys.stderr)
777 if efficiency is None:
778 efficiency = dump
779 else:
780 efficiency += dump
781 else:
782 raise ValueError("cannot determine contents of %s" % filename)
783 return rate_vs_threshold, efficiency
784
785
786def process_file(filename, products, live_time_program, tmp_path = None, veto_segments_name = None, verbose = False):
787 #
788 # connect to database and summarize contents
789 #
790
791 working_filename = dbtables.get_connection_filename(filename, tmp_path = tmp_path, verbose = verbose)
792 contents = SnglBurstUtils.CoincDatabase(sqlite3.connect(str(working_filename)), live_time_program, search = "StringCusp", veto_segments_name = veto_segments_name)
793 if verbose:
794 SnglBurstUtils.summarize_coinc_database(contents, filename = working_filename)
795
796 #
797 # augment summary with extra stuff we need. the filename
798 # is recorded for dumping debugging information related to
799 # missed injections. if burca was run with the
800 # --coincidence-segments option then the value is copied
801 # into a segmentlistdict to facilitate the computation of
802 # livetime
803 #
804
805 contents.filename = filename
806
807 contents.coincidence_segments = ligolwprocess.get_process_params(contents.xmldoc, "lalburst_coinc", "--coincidence-segments")
808 if contents.coincidence_segments:
809 # as a side-effect, this enforces the rule that
810 # burca has been run on the input file exactly once
811 contents.coincidence_segments, = contents.coincidence_segments
812 # FIXME: remove when LAL accepts unicode
813 contents.coincidence_segments = contents.coincidence_segments.encode('utf-8')
814 contents.coincidence_segments = segments.segmentlistdict.fromkeys(contents.seglists, segmentsUtils.from_range_strings(contents.coincidence_segments.split(","), boundtype = dbtables.lsctables.LIGOTimeGPS).coalesce())
815 else:
816 contents.coincidence_segments = None
817
818 #
819 # process contents
820 #
821
822 for n, product in enumerate(products):
823 if verbose:
824 print("%s: adding to product %d ..." % (working_filename, n), file=sys.stderr)
825 product.add_contents(contents, verbose = verbose)
826
827 #
828 # close
829 #
830
831 contents.connection.close()
832 dbtables.discard_connection_filename(filename, working_filename, verbose = verbose)
833
834
835def process_files(filenames, products, live_time_program, tmp_path = None, veto_segments_name = None, verbose = False):
836 for n, filename in enumerate(filenames):
837 if verbose:
838 print("%d/%d: %s" % (n + 1, len(filenames), filename), file=sys.stderr)
839 process_file(filename, products, live_time_program, tmp_path = tmp_path, veto_segments_name = veto_segments_name, verbose = verbose)
840
841
842def write_products(products, prefix, image_formats, verbose = False):
843 format = "%%s%%0%dd%%s.%%s" % (int(math.log10(max(len(products) - 1, 1))) + 1)
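	# e.g., with a single product and prefix "string_rate_" this becomes
	# "%s%01d%s.%s" and yields file names like "string_rate_1a.png":
	# product number, then a letter per figure, then the image format
	# extension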
844 n = 1
845 while products:
846 if verbose:
847 print("finishing product %d ..." % (n - 1), file=sys.stderr)
848 product = products.pop(0)
849 # write dump of raw data
850 filename = "%sdump_%d.pickle" % (prefix, n)
851 if verbose:
852 print("\twriting %s ..." % filename, end=' ', file=sys.stderr)
853 pickle.dump(product, open(filename, "wb"))
854 if verbose:
855 print("done", file=sys.stderr)
856 # write plots
857 for m, fig in enumerate(product.finish()):
858 for ext in image_formats:
859 filename = format % (prefix, n, chr(m + 97), ext)
860 if verbose:
861 print("\twriting %s ..." % filename, end=' ', file=sys.stderr)
862 fig.savefig(filename)
863 if verbose:
864 print("done", file=sys.stderr)
865 n += 1
866
867
868options, filenames = parse_command_line()
869
870
871print("""Command line:
872
873$ %s
874""" % " ".join(sys.argv), file=sys.stderr)
875if options.open_box:
876 print("""
877
878---=== !! BOX IS OPEN !! ===---
879
880PRESS CTRL-C SOON IF YOU DIDN'T MEAN TO OPEN THE BOX
881
882""", file=sys.stderr)
883
884
885#
886# figure out which files are from injection runs and which aren't
887#
888
889
890if options.verbose:
891 print("Identifying injection and non-injection databases ...", file=sys.stderr)
892injection_filenames, noninjection_filenames = group_files(filenames, verbose = options.verbose)
893
894
895#
896# initialize the data collection objects
897#
898
899
900if options.import_dump:
901 if options.verbose:
902 print("Loading dump files ...", file=sys.stderr)
903 rate_vs_threshold, efficiency = import_dumps(options.import_dump, verbose = options.verbose)
904 # override box openness in rate-vs-threshold data
905 if rate_vs_threshold is not None:
906 rate_vs_threshold.open_box = options.open_box
907 if efficiency is not None and options.open_box and not efficiency.open_box:
908 raise ValueError("Box is open but one or more imported efficiency dump files were generated in closed-box mode. Efficiency must be re-measured in open-box mode to use the correct detection threshold.")
909else:
910 rate_vs_threshold, efficiency = None, None
911
912
913#
914# collect zero-lag and background statistics
915#
916
917
918if options.detection_threshold is None:
919 if options.verbose:
920 print("Collecting background and zero-lag statistics ...", file=sys.stderr)
921
922 children = {}
923 rate_vs_thresholds = []
924 # group files into bins of about the same total number of bytes.
925 # don't try to create more groups than there are files
926 for filenames in pack_files(noninjection_filenames, min(options.threads, len(noninjection_filenames)), verbose = options.verbose):
927 # shuffle file names to avoid copying all the big files
928 # into the scratch space at the same time
929 random.shuffle(filenames)
930 # launch thread
931 r, w = os.pipe()
932 r, w = os.fdopen(r, "rb"), os.fdopen(w, "wb")
933 pid = os.fork()
934 if pid == 0:
935 # we're the child process
936 r.close()
937 try:
938 # create a new book-keeping object
939 rate_vs_threshold = RateVsThreshold(options.open_box, record_background = options.record_background, record_candidates = options.record_candidates)
940 process_files(filenames, [rate_vs_threshold], options.live_time_program, tmp_path = options.tmp_space, veto_segments_name = options.vetoes_name, verbose = options.verbose)
941 pickle.dump(rate_vs_threshold, w)
942 except:
943 pickle.dump(traceback.format_exc(), w)
944 w.close()
945 sys.exit(1)
946 w.close()
947 sys.exit(0)
948 # we're not the child process
949 w.close()
950 children[r] = pid
951 # collect all children, report any exceptions that occured, combine
952 # results
953 while children:
954 for r in select.select(children.keys(), [], [])[0]:
955 process_output = pickle.load(r)
956 r.close()
957 pid, exit_status = os.waitpid(children.pop(r), 0)
958 if isinstance(process_output, RateVsThreshold):
959 if rate_vs_threshold is None:
960 rate_vs_threshold = process_output
961 else:
962 rate_vs_threshold += process_output
963 else:
964 print(process_output, file=sys.stderr)
965 del process_output
966 if exit_status:
967 sys.exit(exit_status)
968 if rate_vs_threshold is None:
969 raise ValueError("no non-injection data available: cannot determine detection threshold. Provide non-injection data or set an explicit detection threshold. Consult --help for more information.")
970
971 # write data products
972 write_products([rate_vs_threshold], "string_rate_", options.image_formats, verbose = options.verbose)
973
974 # print summary information, and set detection threshold
975 if options.open_box:
976 try:
977 print("Zero-lag events: %d" % len(rate_vs_threshold.zero_lag), file=sys.stderr)
978 except OverflowError:
979 # python < 2.5 can't handle list-like things with more than 2**31 elements
980 # FIXME: remove when we can rely on python >= 2.5
981 print("Zero-lag events: %d" % rate_vs_threshold.zero_lag.n, file=sys.stderr)
982 print("Total time in zero-lag segments: %s s" % str(rate_vs_threshold.zero_lag_time), file=sys.stderr)
983 print("Time-slide events: %d" % rate_vs_threshold.n_background, file=sys.stderr)
984 print("Total time in time-slide segments: %s s" % str(rate_vs_threshold.background_time), file=sys.stderr)
985 if options.open_box:
986 detection_threshold = rate_vs_threshold.zero_lag[-1]
987 print("Likelihood ratio for highest-ranked zero-lag survivor: %.9g" % detection_threshold, file=sys.stderr)
988 else:
989 # if the background and zero-lag live times are identical,
990 # then the loudest zero-lag event is simulated by the
991 # loudest background event so we want entry -1 in the
992 # sorted list; if the background livetime is twice the
993 # zero-lag live time then the loudest zero-lag event is
994 # simulated by the next-to-loudest background event so we
995 # want entry -2 in the sorted list.
996 detection_threshold = sorted(rate_vs_threshold.background)[-int(round(rate_vs_threshold.background_time / rate_vs_threshold.zero_lag_time))]
997 print("Simulated \\log \\Lambda for highest-ranked zero-lag survivor: %.9g" % detection_threshold, file=sys.stderr)
998else:
999 detection_threshold = options.detection_threshold
1000 print("Likelihood ratio for highest-ranked zero-lag survivor from command line: %.9g" % detection_threshold, file=sys.stderr)
1001
1002
1003#
1004# measure detection efficiency based on loudest (simulated, if box is
1005# closed) zero-lag survivor
1006#
1007
1008
1009if options.verbose:
1010 print("Collecting efficiency statistics ...", file=sys.stderr)
1011children = {}
1012# group files into bins of about the same total number of bytes. don't try
1013# to create more groups than there are files
1014for filenames in pack_files(injection_filenames, min(options.threads, len(injection_filenames)), verbose = options.verbose):
1015 # shuffle file names to avoid copying all the big files
1016 # into the scratch space at the same time
1017 random.shuffle(filenames)
1018 # launch thread
1019 r, w = os.pipe()
1020 r, w = os.fdopen(r, "rb"), os.fdopen(w, "wb")
1021 pid = os.fork()
1022 if pid == 0:
1023 # we're the child process
1024 r.close()
1025 try:
1026 # create a new book-keeping object
1027 efficiency = Efficiency(detection_threshold, options.cal_uncertainty, options.injections_bin_size, options.open_box)
1028 process_files(filenames, [efficiency], options.live_time_program, tmp_path = options.tmp_space, veto_segments_name = options.vetoes_name, verbose = options.verbose)
1029 pickle.dump(efficiency, w)
1030 except:
1031 pickle.dump(traceback.format_exc(), w)
1032 w.close()
1033 sys.exit(1)
1034 w.close()
1035 sys.exit(0)
1036 # we're not the child process
1037 w.close()
1038 children[r] = pid
1039# collect all children, report any exceptions that occurred, combine
1040# results
1041while children:
1042 for r in select.select(children.keys(), [], [])[0]:
1043 process_output = pickle.load(r)
1044 r.close()
1045 pid, exit_status = os.waitpid(children.pop(r), 0)
1046 if isinstance(process_output, Efficiency):
1047 if efficiency is None:
1048 efficiency = process_output
1049 else:
1050 efficiency += process_output
1051 else:
1052 print(process_output, file=sys.stderr)
1053 del process_output
1054 if exit_status:
1055 sys.exit(exit_status)
1056# write data products
1057if efficiency is not None:
1058 write_products([efficiency], "string_efficiency_", options.image_formats, verbose = options.verbose)
1059else:
1060 if options.verbose:
1061 print("no injection data available, not writing efficiency data products.", file=sys.stderr)
1062
1063if options.verbose:
1064 print("done.", file=sys.stderr)