├── mc.pdf ├── xmmpn.arf ├── xmmpn.rmf ├── source.pha ├── sim.xcm ├── LICENSE ├── groupall ├── sim-rand.xcm ├── fit.xcm └── README.md /mc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonVaughanDataAndCode/xspec/HEAD/mc.pdf -------------------------------------------------------------------------------- /xmmpn.arf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonVaughanDataAndCode/xspec/HEAD/xmmpn.arf -------------------------------------------------------------------------------- /xmmpn.rmf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonVaughanDataAndCode/xspec/HEAD/xmmpn.rmf -------------------------------------------------------------------------------- /source.pha: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonVaughanDataAndCode/xspec/HEAD/source.pha -------------------------------------------------------------------------------- /sim.xcm: -------------------------------------------------------------------------------- 1 | # Generate N fake spectra from the same model 2 | # with no background file 3 | 4 | # Return TCL results for XSPEC commands. 5 | set xs_return_result 1 6 | 7 | # Keep going until fit converges. 8 | query yes 9 | 10 | # open plot device 11 | cpd /xs 12 | 13 | setplot en 14 | 15 | # Define a procedure called FAKER which actually 16 | # fakes the data 17 | proc faker {par1 par2} { 18 | fakeit none & xmmpn.rmf & xmmpn.arf & y & sim & $par1 & $par2 & /* 19 | } 20 | 21 | # Loop through all data 22 | for {set i 1} {$i < 501} {incr i} { 23 | 24 | # Set up the model. 25 | model wabs*(powerlaw) & /* 26 | newpar 1 0.123324 0.01 1.0E-4 1.0E-4 10 10 /* 27 | newpar 2 2.66865 0.01 0 0 1E+24 1E+24 /* 28 | newpar 3 1.861259E-04 1e-6 0 0 1 1 /* 29 | show 30 | 31 | # Fake the data (par1=filename par2=exposure time) 32 | faker sim_$i\.fak 5000 33 | 34 | # Plot the fake 35 | ignore **-0.3 36 | ignore 10.0-** 37 | setplot rebin 1000 25 38 | plot ldata del 39 | 40 | # Reset everything 41 | data none 42 | } 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Simon Vaughan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /groupall: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # 'Bash' script to group a whole bunch of spectra 4 | # using GRPPHA. Input parameter is the root name of the 5 | # spectral files. 6 | # Simon Vaughan, Leicester 2006 7 | # 8 | # Input files <root>_<n>.fak 9 | # Output files <root>_<n>.fak.g 10 | # 11 | # v1.0 -- 09/12/2004 -- Simon Vaughan 12 | # v1.1 -- 23/01/2006 -- fixed bug if <root>_<n>.fak.g 13 | # already exists. 14 | 15 | # Check command-line input 16 | 17 | if [ -z "$1" ] ; then 18 | parm1="<rootname>" 19 | echo "Usage: $0 $parm1" 20 | exit 1 21 | else 22 | root=$1 23 | fi 24 | 25 | # make sure FTOOLS are initialised 26 | 27 | which grppha 2> ftools.nothere > ftools.here 28 | if [ ! -s ftools.here ] ; then 29 | echo groupall: Make sure HEAsoft is initialised 30 | exit 1 31 | fi 32 | 33 | # Make a list of all the appropriate files 34 | 35 | rm -f file.list 36 | ls -1 $root\_*.fak > file.list 2> errs.list 37 | 38 | # Check if there are zero files 39 | 40 | if [ ! -s file.list ] ; then 41 | echo groupall: Missing files $root 42 | exit 1 43 | fi 44 | 45 | # Main loop: for each file... 46 | 47 | n=0 48 | for fname in $( cat file.list ); do 49 | 50 | let n=n+1 51 | 52 | # Set output filename 53 | 54 | outp=$fname\.g 55 | 56 | # remove file if it's already there 57 | 58 | if [ -e $outp ] ; then 59 | rm -f $outp 60 | fi 61 | 62 | # Let user know what's happening 63 | 64 | echo Grouping input file: $fname output file $outp 65 | 66 | # bin spectrum (until N >= 20 ct/bin) 67 | 68 | grppha infile=$fname outfile=$outp chatter=1 \ 69 | comm="group min 20 & exit" >& grppha.log 70 | 71 | # end of loop 72 | 73 | done 74 | 75 | # Delete the listing files 76 | 77 | rm -f file.list 78 | rm ftools.nothere 79 | rm ftools.here 80 | 81 | # Finished 82 | 83 | echo All Done 84 | exit 0 85 | 86 | -------------------------------------------------------------------------------- /sim-rand.xcm: -------------------------------------------------------------------------------- 1 | # Generate fake spectra from the model that 2 | # is randomised based on the original data fit 3 | # (with no background file) 4 | # Simon Vaughan, Leicester (2006) 5 | # 6 | # Two-part algorithm: 7 | # 1. Load in data. Fit. 8 | # Loop i=1,2,3,...,nn { 9 | # - draw random parameter values from current fit 10 | # - save to ASCII file } 11 | # 12 | # 2. Loop i=1,2,3,...,nn { 13 | # - Read in the ith parameter values 14 | # - set current model to these values 15 | # - simulate data from this model } 16 | # 17 | # v1.0 based on SIM.XCM 18 | # 23/01/2006 -- v1.1 adapted for XSPEC v12 19 | # modified FAKER procedure for v12 20 | # added SIMPARS to randomise the model 21 | # 24/01/2006 -- v1.2 removed "data none" command, causes errors 22 | # due to memory leak in XSPEC v12.2.1 23 | # 27/01/2006 -- v1.3 Moved all parameters to top of script 24 | # Added parameters for Emin/Emax 25 | # 25/04/2006 -- v1.4 Added line 'data none' before call to 'faker' 26 | # to make sure the channel groupings are forgotten 27 | # prior to generating the fake spectrum. 
28 | 29 | # Parameters: 30 | # nn = number of simulations 31 | # Emin = minimum energy to fit 32 | # Emax = maximum energy to fit 33 | 34 | set nn 10 35 | set Emin 0.5 36 | set Emax 10.0 37 | 38 | # Open the file to put the results in. 39 | 40 | set fileid [open sim_result.dat w] 41 | 42 | # Keep going until fit converges. 43 | 44 | query yes 45 | 46 | # open plot device 47 | 48 | cpd /xs 49 | 50 | # Define plotting details 51 | 52 | setplot en 53 | setplot command re x 0.4 10.0 54 | setplot command re y 1e-4 0.5 55 | 56 | # Define a procedure called FAKER which actually 57 | # fakes the data 58 | 59 | proc faker {fname Texp} { 60 | fakeit none & xmmpn.rmf & xmmpn.arf & y & & ${fname} & ${Texp} & /* 61 | } 62 | 63 | # Load the ORIGINAL dataset 64 | 65 | data source.pha 66 | 67 | # Find the exposure time 68 | 69 | tclout expos 70 | set Texp $xspec_tclout 71 | 72 | # Ignore the low/high energies 73 | 74 | ignore **-$Emin 75 | ignore $Emax-** 76 | ignore bad 77 | 78 | # Set up the null hypothesis model. 79 | 80 | model wabs*(powerlaw) & /* 81 | newpar 1 0.123324 0.01 1.0E-4 1.0E-4 10 10 82 | newpar 2 2.66865 0.01 0 0 1E+24 1E+24 83 | newpar 3 1.861259E-04 1e-6 0 0 1 1 84 | show 85 | 86 | # run the FIT command to calculate covariance matrix 87 | 88 | fit 89 | 90 | # plot the data 91 | 92 | setplot command la f Original data 93 | plot ld del 94 | 95 | # ----------------------------------- 96 | # Loop through iterations i = 1,2,3, ...,nn 97 | 98 | for {set i 1} {$i <= $nn} {incr i} { 99 | 100 | # draw at random new parameter values using 101 | # the covariance matrix to define the distribution 102 | 103 | tclout simpars 104 | 105 | # save the randomised parameter values into the file 106 | 107 | puts $fileid "$xspec_tclout" 108 | 109 | # end of this loop 110 | 111 | } 112 | 113 | # Close the file. 
114 | 115 | close $fileid 116 | 117 | # ----------------------------------- 118 | # open the list of randomised parameters 119 | 120 | set fileid [open sim_result.dat r] 121 | 122 | # Loop through iterations i = 1,2,3, ...,nn 123 | 124 | for {set i 1} {$i <= $nn} {incr i} { 125 | 126 | # read in one set of randomised parameters from file 127 | 128 | set parms [gets $fileid] 129 | set par1 [lindex $parms 0] 130 | set par2 [lindex $parms 1] 131 | set par3 [lindex $parms 2] 132 | 133 | # define a new model using randomised parameters 134 | 135 | newpar 1 $par1 136 | newpar 2 $par2 137 | newpar 3 $par3 138 | 139 | # show the randomised model 140 | 141 | show 142 | 143 | # remove the data (to forget channel groupings) 144 | 145 | data none 146 | 147 | # Fake the data (parameters: filename, exposure time) 148 | 149 | faker sim_$i\.fak $Texp 150 | 151 | # Plot the fake data 152 | 153 | ignore **-$Emin 154 | ignore $Emax-** 155 | setplot rebin 5 250 156 | 157 | setplot command la f Simulation number: $i 158 | plot ldata del 159 | 160 | # Reset everything 161 | 162 | data none 163 | 164 | # end of this loop 165 | 166 | } 167 | 168 | # end of script 169 | 170 | exit 171 | 172 | -------------------------------------------------------------------------------- /fit.xcm: -------------------------------------------------------------------------------- 1 | # Fit fake spectra with the same model and 2 | # output results to ASCII file called 'fit_result.dat' 3 | # Simon Vaughan, Leicester (2006) 4 | # 5 | # Data to be fitted are called sim_<i>.fak.g 6 | # where <i> = 1, 2, 3, ..., nn 7 | # 8 | # Fits with two models - model 1 is simpler; 9 | # model 2 is more complex - and outputs the 10 | # fit statistic (chi-squared) for each fit 11 | # 12 | # 23/01/2006 -- v1.1 adapted for XSPEC v12 13 | # changed error command syntax 14 | # using only 1 error call per fit 15 | # 26/01/2006 -- v1.2 added check that DOF > 1 16 | # 27/01/2006 -- v1.3 Moved all parameters to top of script 17 | # Added parameters for Emin/Emax 18 | # 31/01/2006 -- v1.4 Added routine to step through trial 19 | # values of new parameter and use value 20 | # giving min[chi^2] as first guess for fitting 21 | # 02/02/2006 -- v1.5 Minor improvements. Set nE=100 which 22 | # gives better results. Added check for 23 | # non-monotonicity in proc 'shakefit'. 24 | # Output F-test results for each spectrum. 25 | # 05/06/2006 -- v1.6 Revised 'shakefit' procedure to check for 26 | # parameters hitting hard limits and bail out 27 | # after 100 iterations (prevent infinite loop). 28 | # 29 | # WARNING: Bug in XSPEC prior to v12.2.1w gives dodgy F-test probabilities! 30 | # ---------------------------------------------------------- 31 | 32 | # Parameters: 33 | 34 | set nn 10 ;# nn = number of simulations 35 | set Emin 0.5 ;# Emin = minimum energy to fit 36 | set Emax 10.0 ;# Emax = maximum energy to fit 37 | set nE 100 ;# nE = number of energies to step through 38 | 39 | # ---------------------------------------------------------- 40 | # Define a TCL procedure to find the minimum element of a list 41 | # input is 'list'; output is the position of 42 | # the minimum value of 'list'. 
43 | # Internally: $i is a counter 44 | # $value is the value of the ith element of list 45 | # $minval is the minimum value found so far 46 | # $minpos is the position of current minimum 47 | 48 | proc min { list } { 49 | set n [llength $list] 50 | set minpos 0 51 | set minval [lindex $list 0] 52 | for {set i 0} {$i < $n} {incr i} { 53 | set value [lindex $list $i] 54 | if {$value < $minval} { 55 | set minval $value 56 | set minpos $i 57 | } 58 | } 59 | return $minpos 60 | } 61 | 62 | # ---------------------------------------------------------- 63 | # Define a TCL procedure to refine fitting results 64 | # by repeated use of 'error' and 'fit' 65 | # The main loop runs over all parameters. For each free 66 | # parameter perform at least one 'error' command to 67 | # 'shake' it out of local minima. Keep fitting the parameter 68 | # until 'error' does not find a new minimum. 69 | # Finish once all free parameters have been shaken. 70 | # 71 | # $errout comprises nine T/F flags 72 | # If the first flag is TRUE then a new minimum was 73 | # found during the last error command 74 | # 75 | # error stopat <ntrial> <toler> max <max delta-chi^2> <delta-chi^2> <param> 76 | 77 | proc shakefit {} { 78 | tclout modpar ;# find number of parameters 79 | set nopar $xspec_tclout 80 | for {set j 1} {$j <= $nopar} {incr j} { 81 | tclout param $j 82 | set pdel [lindex $xspec_tclout 1] ;# get parameter delta 83 | if {$pdel < 0} continue ;# if frozen goto next param 84 | set doerror 1 85 | set delchi 2.706 ;# delta-chi^2 to probe 86 | set counter 0 87 | while {$doerror == 1 && $counter < 100} { 88 | incr counter 89 | error stopat 10 0.1 max 50.0 $delchi $j 90 | tclout error $j 91 | set errout [lindex $xspec_tclout 2] 92 | if {[string match ???T????? $errout] || [string match ????T???? $errout]} { 93 | set doerror 0 ;# Hit lower/upper limits 94 | } 95 | if [string match F???????? $errout] { 96 | set doerror 0 ;# Not found better fit 97 | } else { 98 | fit 100 0.01 ;# Found better fit 99 | if [string match ?T??????? $errout] { 100 | set delchi [expr $delchi + 2] ;# increase if non-monotonic 101 | } ;# End IF (?T) 102 | } ;# End IF (F?) 103 | } ;# End WHILE 104 | } ;# End FOR 105 | } ;# End PROC 106 | 107 | # ---------------------------------------------------------- 108 | 109 | query no 110 | 111 | # fitting method: leven/migrad 112 | 113 | method leven 114 | 115 | # open plot device 116 | 117 | cpd /xs 118 | 119 | # Define plotting details 120 | 121 | setplot energy 122 | setplot add 123 | setplot command re x 0.4 5.0 124 | setplot command re y 1e-4 0.5 125 | 126 | # Open the file to put the results in. 127 | 128 | set fileout [open fit_result.dat w] 129 | 130 | # ---------------------------------------------------------- 131 | # Loop through all data, i = 1, 2, ..., nn 132 | 133 | for {set i 1} {$i <= $nn} {incr i} { 134 | 135 | # load the grouped spectral file 136 | # called sim_<i>.fak.g (where i=1,2,...,nn) 137 | 138 | data sim_$i.fak.g 139 | 140 | # Ignore the low/high energies - exactly as real data 141 | 142 | ignore **-$Emin 143 | ignore $Emax-** 144 | ignore bad 145 | 146 | # Set up the initial null hypothesis (simple) model. 147 | 148 | model wabs*(powerlaw) & /* 149 | newpar 1 0.123324 0.01 1.0E-4 1.0E-4 10 10 150 | newpar 2 2.66865 0.01 0 0 1E+24 1E+24 151 | newpar 3 1.861259E-04 1e-6 0 0 1 1 152 | 153 | # Check there are enough degrees of freedom to fit 154 | 155 | tclout dof 156 | set tdof [lindex $xspec_tclout 0] 157 | if {$tdof < 3} { 158 | puts "** Not enough degrees of freedom" 159 | continue 160 | } 161 | 162 | # Fit it to the model. 
163 | 164 | fit 100 0.01 165 | 166 | # Make sure there's no better minimum (using 'shakefit' procedure) 167 | 168 | shakefit 169 | 170 | # Get the fit statistic and DOF 171 | 172 | tclout stat 173 | set chi1 $xspec_tclout 174 | tclout dof 175 | set dof1 [lindex $xspec_tclout 0] 176 | 177 | # Plot the final fit 178 | 179 | setplot command la t Model 1 spectrum: $i 180 | setplot command la f chi-squared $chi1 / $dof1 dof 181 | plot ldata 182 | 183 | # ---------------------------------------------------------- 184 | # ----------- INSERT BB ------------------------------------ 185 | # add the extra component being tested for (e.g. BB) 186 | # using EDITMOD command to insert the new component 187 | # 188 | # editmod wabs*(powerlaw+bb) & /* 189 | # newpar 4 0.222110 0.01 0 0 1 1 190 | # newpar 5 2.634640E-06 1e-7 0 0 1 1 191 | # show 192 | 193 | # ----------- INSERT LINE AT LARGEST DELTA-CHI^2 ----------- 194 | 195 | # Find lowest/highest energy data actually used 196 | # nchan = number of channels; E/dE = binenergy/width 197 | 198 | tclout dof 199 | set nchan [expr [lindex $xspec_tclout 1] - 1] 200 | tclout plot ldata x 201 | set E $xspec_tclout 202 | tclout plot ldata xerr 203 | set dE $xspec_tclout 204 | set Ehi [expr [lindex $E $nchan] + [lindex $dE $nchan]] 205 | set nchan 0 206 | set Elo [expr [lindex $E $nchan] - [lindex $dE $nchan]] 207 | 208 | # Insert zero-flux line in spectral model 209 | 210 | editmod wabs*(powerlaw+gaussian) & /* 211 | newpar 4,$Elo,0.01,$Emin,$Elo,$Ehi,$Emax 212 | newpar 5,0.00,0.01 0.0 0.0 1.0 1.0 213 | newpar 6,0.00,1e-7 0.0 0.0 1.0 1.0 214 | freeze 5 ;# Ensure line stays narrow 215 | 216 | # Step through nE energies finding chi-squared at each one 217 | # use logarithmic energy steps from lowest to highest energy 218 | # resetting model each step ('best' rather than 'current') 219 | 220 | steppar best log 4 $Elo $Ehi $nE 221 | 222 | # put steppar output into lists 223 | 224 | tclout steppar statistic ;# chi-squared values 225 | set chisq $xspec_tclout 226 | tclout steppar 4 ;# corresponding parameter values 227 | set trialE $xspec_tclout 228 | 229 | # Put line at energy that gave minimum chi-squared 230 | 231 | set minindex [min $chisq] 232 | set Epeak [lindex $trialE $minindex] 233 | newpar 4,$Epeak,0.01,$Emin,$Emin,$Emax,$Emax 234 | newpar 6,0.00,1e-7 0.0 0.0 1.0 1.0 235 | 236 | # Before fitting, adjust line norm at current energy 237 | 238 | freeze 4 239 | fit 100 0.01 240 | thaw 4 241 | 242 | # Now fit it to the model to improve energy and norm 243 | 244 | fit 100 0.01 245 | 246 | # Make sure there's no better minimum (using 'shakefit' procedure) 247 | 248 | shakefit 249 | 250 | # Get the fit statistic and DOF 251 | 252 | tclout stat 253 | set chi2 $xspec_tclout 254 | tclout dof 255 | set dof2 [lindex $xspec_tclout 0] 256 | 257 | # Plot the final fit 258 | 259 | setplot command la t Model 2 spectrum: $i 260 | setplot command la f chi-squared $chi2 / $dof2 dof 261 | plot ldata 262 | 263 | # Perform an F-test (for the hell of it!) 264 | 265 | ftest $chi2 $dof2 $chi1 $dof1 266 | tclout ftest 267 | set fprob $xspec_tclout 268 | 269 | # Put the chi-square and DOF of each fit into file 270 | 271 | puts $fileout "$i $chi1 $dof1 $chi2 $dof2 $fprob" 272 | 273 | # Reset everything 274 | 275 | data none 276 | model none 277 | 278 | # end of loop 279 | 280 | } 281 | # ---------------------------------------------------------- 282 | 283 | # Close the file. 
284 | 285 | close $fileout 286 | 287 | # end of script 288 | 289 | exit 290 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # xspec 2 | This page gives XSPEC (and shell) scripts for performing Monte Carlo tests within XSPEC. 3 | 4 | * source.pha - The "real" dataset for the worked example 5 | * xmmpn.rmf - The response matrix for the data 6 | * xmmpn.arf - The ancillary response file for the data 7 | * sim-rand.xcm - XSPEC script to produce simulated spectra 8 | * groupall - Bash script to run GRPPHA over all simulated spectra 9 | * fit.xcm - XSPEC script to fit all (grouped) simulated spectra 10 | 11 | Read the [mc.pdf](mc.pdf) file for my (circa 2006) explanation. 12 | 13 | ## Warning 14 | 15 | These scripts were written in 2005-2006 and tested using XSPEC v12.2.1ao running under Scientific Linux 4.2. They have not been maintained since then. I offer no guarantee they will work on other systems. 16 | 17 | ## Referencing the scripts 18 | 19 | If you make use of this code in your work, please do cite the following paper for which these scripts were originally developed. 20 | 21 | [Hurkett, C., Vaughan S., et al. 2008, ApJ, v679, p587](http://adsabs.harvard.edu/abs/2008ApJ...679..587H) 22 | 23 | ## Explanation 24 | 25 | ### Monte Carlo methods 26 | 27 | Monte Carlo methods use random (or quasi-random) data to solve 28 | problems. In the context of hypothesis testing, one 29 | generates randomised data based on the null hypothesis model, 30 | taking care to make the fake data as realistic as possible, 31 | and uses them as a 'control' sample with which to calibrate the 32 | test statistic. Take source detection as an example: one would simulate 33 | a large ensemble of images by randomising the background level, 34 | and measure the flux at the source position in each one. 35 | The frequency distribution (histogram) of the fluxes produced from 36 | the fake data 37 | is a Monte Carlo estimate of the reference PDF. 38 | As well as making sure the data are simulated as accurately as 39 | possible, one must also simulate a large number of datasets 40 | so that the histogram of the test statistics from the simulations 41 | converges on the true PDF. Even if there is 42 | no analytical expression for the reference distribution, it is 43 | always possible to find it using the Monte Carlo method so long 44 | as one can simulate a sufficient number of (realistic) fake datasets. 45 | 46 | Monte Carlo methods are extremely powerful and conceptually simple. 47 | The drawback is that they may require a large amount of computer 48 | processing time to generate and analyse a large quantity of simulated 49 | data. 50 | 51 | ### Application to the F-test 52 | 53 | Maybe you are trying to test for an emission line in a spectrum; 54 | adding the line to the model improves the fit a bit, but you don't 55 | know whether the improvement should be considered significant or 56 | not. 57 | You could use the F-test, but one of the 58 | assumptions behind the F-test is not valid in this case [1]. Normally you 59 | would measure the F-statistic and compare this with 60 | a reference distribution 61 | -- this tells you how unexpected your value of 62 | F is. (In this case the reference distribution for the F-test is the 63 | Fisher-Snedecor distribution.) 64 | The reference distribution gives you a probability, or p-value, for 65 | the given value of F. 
66 | If the probability is small (let's say 67 | p=0.001) then you conclude this result is unlikely to have occurred by 68 | chance, so it must be a significant detection (people often quote this 69 | by inverting the false alarm probability: 100*[1-p] = 99.9 per 70 | cent confidence). But, as we just said, the case of adding a line 71 | violates one of the fundamental assumptions behind the F-test and so 72 | you cannot use the textbook reference 73 | distribution to go from an F-value (what you measure from the data) to 74 | a p-value (how significant it is). But we can solve the problem 75 | using a Monte Carlo approach. 76 | 77 | ### An overview of the method 78 | 79 | The general idea is as follows. First you need to define exactly what 80 | it is you want to test. If you want a clear answer you need a clear 81 | question! In the case of line detection, perhaps you are comparing 82 | a power law to a power law plus an emission line. 83 | The null 84 | hypothesis is that the simpler of these two models is true -- 85 | in this example the 86 | null hypothesis is that the spectrum is just a power law. The 87 | alternative hypothesis is that the spectrum is a power law plus an 88 | emission line. The way you go about making a hypothesis test is to 89 | measure some test statistic from the data. Maybe you used the F-test 90 | and measured an F-value. (The F-value comes from the decrease in 91 | chi-square when you add a line to the model, and the number of 92 | degrees of freedom.) But you don't know the reference distribution to 93 | turn this into a probability. What you need to do is make a large 94 | batch of fake data for which the null hypothesis is true, and measure 95 | the same test statistic for each of the fake datasets. So you make a 96 | fake dataset, measure the test statistic, make another one, measure 97 | the test statistic, etc. etc. If you keep doing this you will build up 98 | the distribution of the test statistic assuming the null hypothesis is 99 | true (because all your fake data are produced using the null 100 | hypothesis). In the 'power law vs. power law plus line' example what 101 | we do is simulate a spectrum of a power law, then fit the data using a 102 | power law with and without a line, and then measure the F-value. Over 103 | and over again. As you perform more simulations you build up a clearer 104 | picture of the distribution of F-values (if the null hypothesis is 105 | true). 106 | 107 | You can then 108 | ask the question: how many simulations show a larger test 109 | statistic (e.g. F-value) than the one I got for my real data? Did 110 | the value I got 111 | for my test statistic appear in many of the simulations or only 112 | very rarely? Maybe the F-value was 6.71 from the real data. And when 113 | we ran the 1,000 simulations we found only 3 out of the 1,000 had a 114 | value bigger than this. 115 | We could conclude there is a 3/1000 chance of getting an F-value like 116 | the one observed if the null hypothesis is true. 117 | This is the false alarm probability, and since it is quite small 118 | we may interpret it as indicating the 119 | null hypothesis is false, and we therefore favour the 120 | alternative hypothesis. 121 | In other words we could say the line is 122 | detected at 99.7 per cent confidence (because 997/1000 simulations 123 | showed a smaller F-value). 124 | 125 | ### Outline of a simple MC method 126 | 127 | A simple Monte Carlo significance test works along the following lines: 128 | 129 | 1. 
Define the null and alternative hypotheses 130 | 131 | 2. Choose a test statistic: call it T 132 | 133 | 3. Measure the test statistic of the real data: call it T_0 134 | 135 | 4. Loop over the simulations. For each i=1,2,...,N: 136 | 137 | a. Produce a simulated data set: D_i 138 | 139 | b. Measure the test statistic from the simulated data: T_i 140 | 141 | 5. Calculate where T_0 falls in the distribution of T_i 142 | 143 | The p-value is the fraction of the T_i values that exceed the measured 144 | T_0 value: p = n[ T_i >= T_0 ]/N. Inverting this, the significance 145 | is 1-p = n[ T_0 > T_i ]/N. Ensure 146 | N is large, or this will not be a very accurate estimate (the 147 | error on the p value is sqrt[p(1-p)/N], which comes from the 148 | binomial formula). 149 | 150 | It is vital that you make a *fair* measurement of the 151 | test statistic from the simulated data -- you must be careful not to 152 | bias this measurement 153 | based on your prior experience of the real data. 154 | Whatever you did to the real data, you must also do to the 155 | simulated ('control') data, otherwise you are not performing 156 | a fair like-for-like test. 157 | 158 | ### Script files for XSPEC 159 | 160 | That's the theory dealt with. Now for a worked example of 161 | running a Monte Carlo test using XSPEC. 162 | 163 | XSPEC will allow you to run a sequence of commands from a script. This 164 | means you can automate the process of generating and fitting a 165 | large sequence of fake data. If you are trying to examine 1,000 spectral 166 | simulations, this really is the only way to do it. Basically all you do 167 | is write the appropriate commands into an XSPEC script file '\*.xcm' 168 | and then you can run it from within XSPEC. XSPEC uses 169 | TCL (Tool Command Language) to control its operation, so you can also 170 | add basic 171 | control structures (like loops) and input/output commands to your 172 | script. In this way an XSPEC script with a few TCL commands is 173 | quite a powerful tool. 174 | 175 | A slight technical problem is that you may need more than just 176 | XSPEC. If your real data were grouped to have 20 counts per bin, 177 | then you must do this to your fake data too. But this needs 178 | to be done by GRPPHA -- outside of XSPEC. What I do is break down the 179 | process into a set of basic tasks, each of which has its own script. 180 | 181 | 1. I use one 182 | XSPEC script to produce N fake datasets. 183 | 2. Then I use a shell script 184 | (from the UNIX command line, outside of XSPEC) to run GRPPHA on each 185 | of the fake data files. 186 | 3. Then I have another XSPEC script to load each 187 | of the (now grouped!) fake data files in turn, fit them, and save the 188 | results to a text file. 189 | 4. Then I use an R script, or a Fortran program 190 | (or whatever) to examine the results and see how the simulated 191 | distribution of the test statistic compares to the 'real' number (a sketch of this last step is given at the end of this section). 192 | 193 | There are a number of ways to run an XSPEC script like this. One is 194 | to simply use the '@' symbol, like you would a normal XSPEC '\*.xcm' file. 195 | ``` 196 | xspec> @script.xcm 197 | ``` 198 | A better way is to use the following from the UNIX command line: 199 | ``` 200 | unix> xspec - script.xcm 201 | ``` 202 | This will start XSPEC and run the script. If you end the script 203 | with an *exit* command it will then return you to the UNIX command line. 
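
As a concrete (hypothetical) sketch of step 4 above, the short TCL script below reads the `fit_result.dat` file written by `fit.xcm` (columns: i chi1 dof1 chi2 dof2 fprob), rebuilds an F-value for each simulation from the two chi-squared fits, and reports the Monte Carlo p-value. The file name `mc-pvalue.tcl`, the observed value `F0 = 6.71` (the example number used earlier) and the reconstruction of F from the two fits are illustrative assumptions, not part of the original scripts; in practice you could equally use R, Fortran, or whatever you prefer.

```
# mc-pvalue.tcl -- illustrative sketch, not part of the original scripts.
# Reads fit_result.dat (columns: i chi1 dof1 chi2 dof2 fprob, as written
# by fit.xcm) and computes the Monte Carlo p-value for an observed F-value.

set F0 6.71        ;# F-value measured from the REAL data (example value)

set fileid [open fit_result.dat r]
set N 0            ;# number of simulations read
set nexceed 0      ;# number with F >= F0

while {[gets $fileid line] >= 0} {
    if {[llength $line] < 6} continue    ;# skip incomplete lines
    set chi1 [lindex $line 1]
    set dof1 [lindex $line 2]
    set chi2 [lindex $line 3]
    set dof2 [lindex $line 4]

    # Standard F-value for the extra model component (assumes dof1 > dof2):
    # F = [(chi1 - chi2) / (dof1 - dof2)] / (chi2 / dof2)
    set F [expr {(($chi1 - $chi2) / double($dof1 - $dof2)) / ($chi2 / double($dof2))}]

    incr N
    if {$F >= $F0} { incr nexceed }
}
close $fileid

# Monte Carlo p-value and its binomial error, p +/- sqrt(p(1-p)/N)
set p [expr {double($nexceed) / double($N)}]
set perr [expr {sqrt($p * (1.0 - $p) / double($N))}]
puts "N = $N simulations, n(F >= F0) = $nexceed"
puts [format "p = %.4f +/- %.4f" $p $perr]
```

Run it with `tclsh mc-pvalue.tcl` once `fit.xcm` has finished. Note that with only N = 10 simulations (the default `nn` in the scripts) the binomial error on p is large, so increase `nn` before drawing any conclusions.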
204 | 205 | 206 | 207 | [1]: There are two conditions that must be satisfied for the F-test to 208 | follow its expected theoretical reference distribution. These are that 209 | the two models being compared are *nested*, and that the null 210 | values of the additional parameters are not on the boundary of 211 | possible parameter space. This second condition is violated when 212 | testing for a line (or any other additive component) because the null 213 | value of one of the new parameters (normalisation) is zero, which is 214 | the boundary of the parameter space. 215 | You should also have enough counts per bin to be able to use chi-square 216 | properly as well. (Or use direct maximum likelihood fitting.) 217 | See [Protassov et al. (2002)](http://adsabs.harvard.edu/abs/2002ApJ...571..545P). 218 | --------------------------------------------------------------------------------