ProteoWizard
MSDataFileTest.cpp
Go to the documentation of this file.
1 //
2 // $Id: MSDataFileTest.cpp 6141 2014-05-05 21:03:47Z chambm $
3 //
4 //
5 // Original author: Darren Kessner <darren@proteowizard.org>
6 //
7 // Copyright 2007 Spielberg Family Center for Applied Proteomics
8 // Cedars-Sinai Medical Center, Los Angeles, California 90048
9 //
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 //
14 // http://www.apache.org/licenses/LICENSE-2.0
15 //
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 //
22 
23 
24 #include "MSDataFile.hpp"
25 #include "Diff.hpp"
26 #include "IO.hpp"
27 #include "SpectrumListBase.hpp"
28 #include "ChromatogramListBase.hpp"
29 #include "examples.hpp"
33 #include <boost/iostreams/filtering_stream.hpp>
34 #include <boost/iostreams/filter/gzip.hpp>
35 #include <boost/iostreams/device/file_descriptor.hpp>
36 #include <boost/iostreams/copy.hpp>
37 
38 
39 using namespace pwiz::util;
40 using namespace pwiz::cv;
41 using namespace pwiz::data;
42 using namespace pwiz::msdata;
43 
44 
// Sink for verbose diagnostics. Null by default; main() points it at cout
// when the first command-line argument is "-v". Writers test it before use.
ostream* os_(0);


// Common filename prefix for the temporary files these tests create.
string filenameBase_("temp.MSDataFileTest");
49 
50 
52 {
53  // remove metadata ptrs appended on read
54  vector<SourceFilePtr>& sfs = msd.fileDescription.sourceFilePtrs;
55  if (!sfs.empty()) sfs.erase(sfs.end()-1);
56  vector<SoftwarePtr>& sws = msd.softwarePtrs;
57  if (!sws.empty()) sws.erase(sws.end()-1);
58 
59  // remove current DataProcessing created on read
60  SpectrumListBase* sl = dynamic_cast<SpectrumListBase*>(msd.run.spectrumListPtr.get());
61  ChromatogramListBase* cl = dynamic_cast<ChromatogramListBase*>(msd.run.chromatogramListPtr.get());
64 }
65 
67 {
68  string filename1 = filenameBase_ + ".mgf";
69  string filename2 = filenameBase_ + ".mzXML";
70 
71  ofstream ofs(filename1.c_str());
72  string mgf = "CHARGE=2+ and 3+\nBEGIN IONS\nPEPMASS=952.924194 145032.0000\nCHARGE=2+\nRTINSECONDS=301.48\n271.0874 2\n298.1747 4\nEND IONS\nBEGIN IONS\nPEPMASS=503.800000 67522.2000\nCHARGE=2+\nRTINSECONDS=302.51\n147.1840 3\n154.3668 3\n162.2118 2\n162.9007 1\n167.3297 1\n175.2387 2\n184.9460 3\nEND IONS\n";
73  ofs.write(mgf.c_str(), mgf.length());
74  ofs.close();
75 
76  // make sure that round trip doesn't systematically increase converted scan numbers
77  for (int loop = 3; loop--; )
78  {
79  MSDataFile msd1(filename1); // read back the MGF
80  const SpectrumList& sl = *msd1.run.spectrumListPtr;
81  SpectrumPtr spectrum = sl.spectrum(0);
82  unit_assert(spectrum->id == "index=0");
83  MSDataFile::WriteConfig writeConfig;
84  writeConfig.format = MSDataFile::Format_mzXML;
85  MSDataFile::write(msd1, filename2, writeConfig); // write as mzXML
86  MSDataFile msd2(filename2); // read back the mzXML
87  const SpectrumList& sl2= *msd2.run.spectrumListPtr;
88  SpectrumPtr spectrum2 = sl2.spectrum(0);
89  unit_assert(spectrum2->id == "index=1"); // mzXML is 1-based
90  MSDataFile::WriteConfig writeConfig2;
91  writeConfig2.format = MSDataFile::Format_MGF;
92  MSDataFile::write(msd2, filename1, writeConfig2); // write as mgf
93  }
94 
95  // remove temp files
96  boost::filesystem::remove(filename1);
97  boost::filesystem::remove(filename2);
98 }
99 
100 
102  const DiffConfig diffConfig)
103 {
104  if (os_) *os_ << "validateWriteRead()\n " << writeConfig << endl;
105 
106  string filename1 = filenameBase_ + ".1";
107  string filename2 = filenameBase_ + ".2";
108  string filename3 = filenameBase_ + ".3";
109  string filename4 = filenameBase_ + ".\xE4\xB8\x80\xE4\xB8\xAA\xE8\xAF\x95.4";
110  // FIXME: 4-byte UTF-8 not working: string filename5 = filenameBase_ + ".\x01\x04\xA4\x01\x04\xA2.5";
111 
112  {
113  // create MSData object in memory
114  MSData tiny;
116 
117  if (writeConfig.format == MSDataFile::Format_mzXML)
118  {
119  // remove s22 since it is not written to mzXML
120  static_cast<SpectrumListSimple&>(*tiny.run.spectrumListPtr).spectra.pop_back();
121  }
122 
123  // write to file #1 (static)
124  MSDataFile::write(tiny, filename1, writeConfig);
125 
126  // simulate CLI garbage collect behavior, wherein delayed deletes stress
127  // memory and file handle usage
128  {
129  std::vector< boost::shared_ptr< MSDataFile > > msds;
130  for (int i=0;i<100;i++)
131  {
132  boost::shared_ptr<MSDataFile> msd1(new MSDataFile(filename1));
133  msds.push_back(msd1);
134  hackInMemoryMSData(*msd1);
135  Diff<MSData, DiffConfig> diff(tiny, *msd1, diffConfig);
136  }
137  }
138 
139  // read back into an MSDataFile object
140  MSDataFile msd1(filename1);
141  hackInMemoryMSData(msd1);
142 
143  // compare
144  Diff<MSData, DiffConfig> diff(tiny, msd1, diffConfig);
145  if (diff && os_) *os_ << diff << endl;
146  unit_assert(!diff);
147 
148  // write to file #2 (member)
149  msd1.write(filename2, writeConfig);
150 
151  // read back into another MSDataFile object
152  MSDataFile msd2(filename2);
153  hackInMemoryMSData(msd2);
154 
155  // compare
156  diff(tiny, msd2);
157  if (diff && os_) *os_ << diff << endl;
158  unit_assert(!diff);
159 
160  // now give the gzip read a workout
161  bio::filtering_istream tinyGZ(bio::gzip_compressor() | bio::file_descriptor_source(filename1));
162  bio::copy(tinyGZ, bio::file_descriptor_sink(filename1+".gz", ios::out|ios::binary));
163 
164  MSDataFile msd3(filename1+".gz");
165  hackInMemoryMSData(msd3);
166 
167  // compare
168  diff(tiny, msd3);
169  if (diff && os_) *os_ << diff << endl;
170  unit_assert(!diff);
171 
172  // test writing to a stream
173  ostringstream oss;
174  msd1.write(oss, writeConfig);
175  string ossStr = oss.str();
176  ofstream ofs(filename3.c_str());
177  ofs.write(ossStr.c_str(), ossStr.length());
178  ofs.close();
179 
180  // read back into another MSDataFile object
181  MSDataFile msd4(filename3);
182  hackInMemoryMSData(msd4);
183 
184  // compare
185  diff(tiny, msd4);
186  if (diff && os_) *os_ << diff << endl;
187  unit_assert(!diff);
188 
189 
190  // write to file #4 (testing two byte UTF-8 code points)
191  msd1.write(filename4, writeConfig);
192 
193  // read back into another MSDataFile object
194  MSDataFile msd5(filename4);
195  hackInMemoryMSData(msd5);
196 
197  // compare
198  diff(tiny, msd5);
199  if (diff && os_) *os_ << diff << endl;
200  unit_assert(!diff);
201 
202 
203  // write to file #5 (testing four byte UTF-8 code points)
204  /*msd1.write(filename5, writeConfig);
205 
206  // read back into another MSDataFile object
207  MSDataFile msd6(filename5);
208  hackInMemoryMSData(msd6);
209 
210  // compare
211  diff(tiny, msd6);
212  if (diff && os_) *os_ << diff << endl;
213  unit_assert(!diff);*/
214  }
215 
216  // remove temp files
217  boost::filesystem::remove(filename1);
218  boost::filesystem::remove(filename2);
219  boost::filesystem::remove(filename1 + ".gz");
220  boost::filesystem::remove(filename3);
221  boost::filesystem::remove(filename4);
222  //boost::filesystem::remove(filename5);
223 }
224 
225 void test()
226 {
227  MSDataFile::WriteConfig writeConfig;
228  DiffConfig diffConfig;
229 
231 
232  // mzML 64-bit, full diff
233  validateWriteRead(writeConfig, diffConfig);
234 
235  writeConfig.indexed = false;
236  validateWriteRead(writeConfig, diffConfig); // no index
237  writeConfig.indexed = true;
238 
239  // mzML 32-bit, full diff
240  writeConfig.binaryDataEncoderConfig.precision = BinaryDataEncoder::Precision_32;
241  validateWriteRead(writeConfig, diffConfig);
242 
243  // mzXML 32-bit, diff ignoring metadata and chromatograms
244  writeConfig.format = MSDataFile::Format_mzXML;
245  diffConfig.ignoreMetadata = true;
246  diffConfig.ignoreChromatograms = true;
247  validateWriteRead(writeConfig, diffConfig);
248 
249  // mzXML 64-bit, diff ignoring metadata and chromatograms
250  writeConfig.binaryDataEncoderConfig.precision = BinaryDataEncoder::Precision_64;
251  validateWriteRead(writeConfig, diffConfig);
252 
253  writeConfig.indexed = false;
254  validateWriteRead(writeConfig, diffConfig); // no index
255  writeConfig.indexed = true;
256 }
257 
258 
259 void demo()
260 {
261  MSData tiny;
263 
265  MSDataFile::write(tiny, filenameBase_ + ".64.mzML", config);
266 
267  config.binaryDataEncoderConfig.precision = BinaryDataEncoder::Precision_32;
268  MSDataFile::write(tiny, filenameBase_ + ".32.mzML", config);
269 
271  MSDataFile::write(tiny, filenameBase_ + ".txt", config);
272 
274  MSDataFile::write(tiny, filenameBase_ + ".32.mzXML", config);
275 
276  config.binaryDataEncoderConfig.precision = BinaryDataEncoder::Precision_64;
277  MSDataFile::write(tiny, filenameBase_ + ".64.mzXML", config);
278 }
279 
280 
// Byte signature that TestReader::identify() looks for at the start of a
// file: 0x01 0xA1, then the letters of "Finnigan", each followed by a NUL
// byte (18 bytes in total).
const char rawHeader_[] =
{
    '\x01', '\xA1',
    'F', '\0',
    'i', '\0',
    'n', '\0',
    'n', '\0',
    'i', '\0',
    'g', '\0',
    'a', '\0',
    'n', '\0'
};
284 
285 
286 class TestReader : public Reader
287 {
288  public:
289 
290  TestReader() : count(0) {}
291 
292  virtual std::string identify(const std::string& filename, const std::string& head) const
293  {
294  if (filename.size()<=4 || filename.substr(filename.size()-4)!=".RAW")
295  return std::string("");
296 
297  for (size_t i=0; i<sizeof(rawHeader_); i++)
298  if (head[i] != rawHeader_[i])
299  return std::string("");
300 
301  count++;
302  return filename;
303  }
304 
305  virtual void read(const std::string& filename, const std::string& head, MSData& result, int runIndex = 0,
306  const Config& config = Config()) const
307  {
308  count++;
309  }
310 
311  virtual void read(const std::string& filename,
312  const std::string& head,
313  std::vector<MSDataPtr>& results,
314  const Config& config = Config()) const
315  {
316  results.push_back(MSDataPtr(new MSData));
317  read(filename, head, *results.back(), 0, config);
318  }
319 
320  const char *getType() const {return "testReader";} // satisfy inheritance
321 
322  mutable int count;
323 };
324 
325 
327 {
328  // create a file
329  string filename = filenameBase_ + ".RAW";
330  ofstream os(filename.c_str());
331  os.write(rawHeader_, 18);
332  os.close();
333 
334  // open the file with our Reader
335  TestReader reader;
336  MSDataFile msd(filename, &reader);
337 
338  // verify that our reader got called properly
339  unit_assert(reader.count == 2);
340 
341  // remove temp file
342  boost::filesystem::remove(filename);
343 
344  if (os_) *os_ << endl;
345 }
346 
347 
348 void testSHA1()
349 {
350  if (os_) *os_ << "testSHA1()\n";
351 
352  // write out a test file
353 
354  string filename = filenameBase_ + ".SHA1Test";
355  MSData tiny;
357  MSDataFile::write(tiny, filename);
358 
359  {
360  // read in without SHA-1 calculation
361  MSDataFile msd(filename);
362 
363  if (os_)
364  {
365  *os_ << "no SHA-1:\n";
367  IO::write(writer, *msd.fileDescription.sourceFilePtrs.back());
368  }
369 
371  unit_assert(!msd.fileDescription.sourceFilePtrs.back()->hasCVParam(MS_SHA_1));
372 
373  // read in with SHA-1 calculation
374 
375  MSDataFile msd_sha1(filename, 0, true);
376 
377  if (os_)
378  {
379  *os_ << "with SHA-1:\n";
381  IO::write(writer, *msd_sha1.fileDescription.sourceFilePtrs.back());
382  }
383 
384  unit_assert(!msd_sha1.fileDescription.sourceFilePtrs.empty());
385  unit_assert(msd_sha1.fileDescription.sourceFilePtrs.back()->hasCVParam(MS_SHA_1));
386  }
387 
388  // clean up
389 
390  boost::filesystem::remove(filename);
391  if (os_) *os_ << endl;
392 }
393 
394 
395 int main(int argc, char* argv[])
396 {
397  TEST_PROLOG(argc, argv)
398 
399  try
400  {
401  if (argc>1 && !strcmp(argv[1],"-v")) os_ = &cout;
402  test();
403  //demo();
404  testReader();
405  testSHA1();
406  }
407  catch (exception& e)
408  {
409  TEST_FAILED(e.what())
410  }
411  catch (...)
412  {
413  TEST_FAILED("Caught unknown exception.")
414  }
415 
417 }
418 
common functionality for base SpectrumList implementations
common functionality for base ChromatogramList implementations
The XMLWriter class provides simple, tag-level XML syntax writing.
Definition: XMLWriter.hpp:47
std::vector< SourceFilePtr > sourceFilePtrs
list and descriptions of the source files this mzML document was generated or derived from...
Definition: MSData.hpp:89
virtual void setDataProcessingPtr(DataProcessingPtr dp)
set DataProcessing
ChromatogramListPtr chromatogramListPtr
all chromatograms for this run.
Definition: MSData.hpp:826
virtual std::string identify(const std::string &filename, const std::string &head) const
Format_Text
bool ignoreMetadata
ignore all file level metadata, and most scan level metadata, i.e.
Definition: Diff.hpp:214
boost::shared_ptr< Spectrum > SpectrumPtr
Definition: MSData.hpp:569
virtual void setDataProcessingPtr(DataProcessingPtr dp)
set DataProcessing
Calculate diffs of objects in a ProteoWizard data model hierarchy.
Definition: diff_std.hpp:142
#define TEST_EPILOG
Definition: unit.hpp:182
boost::shared_ptr< DataProcessing > DataProcessingPtr
Definition: MSData.hpp:287
string filename1
virtual SpectrumPtr spectrum(size_t index, bool getBinaryData=false) const =0
retrieve a spectrum by index
configuration for write()
Definition: MSDataFile.hpp:52
FileDescription fileDescription
information pertaining to the entire mzML file (i.e. not specific to any part of the data set) is sto...
Definition: MSData.hpp:858
void testSHA1()
Interface for accessing spectra, which may be stored in memory or backed by a data file (RAW...
Definition: MSData.hpp:656
void diff(const string &filename1, const string &filename2)
MS_SHA_1
SHA-1: SHA-1 (Secure Hash Algorithm-1) is a cryptographic hash function designed by the National Secu...
Definition: cv.hpp:2164
void demo()
PWIZ_API_DECL int testReader(const pwiz::msdata::Reader &reader, const std::vector< std::string > &args, bool testAcceptOnly, bool requireUnicodeSupport, const TestPathPredicate &isPathTestable)
A common test harness for vendor readers.
Run run
a run in mzML should correspond to a single, consecutive and coherent set of scans on an instrument...
Definition: MSData.hpp:882
PWIZ_API_DECL void write(minimxml::XMLWriter &writer, const CV &cv)
const char * getType() const
interface for file readers
Definition: Reader.hpp:37
Format_mzXML
Definition: MSDataFile.hpp:49
boost::shared_ptr< MSData > MSDataPtr
Definition: MSData.hpp:909
int main(int argc, char *argv[])
void hackInMemoryMSData(MSData &msd)
MSData object plus file I/O.
Definition: MSDataFile.hpp:40
configuration struct for diffing MSData types
Definition: Diff.hpp:205
void validateMmgfMzxmlRoundTrip()
SpectrumListPtr spectrumListPtr
all mass spectra and the acquisitions underlying them are described and attached here. Subsidiary data arrays are also both described and attached here.
Definition: MSData.hpp:823
PWIZ_API_DECL void read(std::istream &is, CV &cv)
void validateWriteRead(const MSDataFile::WriteConfig &writeConfig, const DiffConfig diffConfig)
#define TEST_FAILED(x)
Definition: unit.hpp:176
PWIZ_API_DECL void initializeTiny(IdentData &mzid)
std::vector< SoftwarePtr > softwarePtrs
list and descriptions of software used to acquire and/or process the data in this mzML file...
Definition: MSData.hpp:867
Format_MGF
Definition: MSDataFile.hpp:49
void test()
string filenameBase_
BinaryDataEncoder::Config binaryDataEncoderConfig
Definition: MSDataFile.hpp:55
static void write(const MSData &msd, const std::string &filename, const WriteConfig &config=WriteConfig(), const pwiz::util::IterationListenerRegistry *iterationListenerRegistry=0)
static write function for any MSData object; iterationListenerRegistry may be used for progress updat...
virtual void read(const std::string &filename, const std::string &head, std::vector< MSDataPtr > &results, const Config &config=Config()) const
#define TEST_PROLOG(argc, argv)
Definition: unit.hpp:174
virtual void read(const std::string &filename, const std::string &head, MSData &result, int runIndex=0, const Config &config=Config()) const
const char rawHeader_[]
This is the root element of ProteoWizard; it represents the mzML element, defined as: intended to cap...
Definition: MSData.hpp:845
ostream * os_
Simple writeable in-memory implementation of SpectrumList.
Definition: MSData.hpp:712
#define unit_assert(x)
Definition: unit.hpp:85
Definition: cv.hpp:91