00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047 #include <cassert>
00048 #include <fstream>
00049 #include <algorithm>
00050 #include "ncl/nxsmultiformat.h"
00051 #include "ncl/nxsstring.h"
00052
/// Largest chunk (in bytes) that FileToCharBuffer allocates and reads from a
/// stream at one time.
const unsigned MAX_BUFFER_SIZE = 0x80000;

/// Lower-case names of the file formats understood by MultiFormatReader.
/// formatNameToCode() converts the index of a name in this array directly
/// into a DataFormatType, so the order of the entries is significant.
const char * gFormatNames[] = { "nexus",
                                "dnafasta",
                                "aafasta",
                                "rnafasta",
                                "dnaphylip",
                                "rnaphylip",
                                "aaphylip",
                                "discretephylip",
                                "dnaphylipinterleaved",
                                "rnaphylipinterleaved",
                                "aaphylipinterleaved",
                                "discretephylipinterleaved",
                                "dnarelaxedphylip",
                                "rnarelaxedphylip",
                                "aarelaxedphylip",
                                "discreterelaxedphylip",
                                "dnarelaxedphylipinterleaved",
                                "rnarelaxedphylipinterleaved",
                                "aarelaxedphylipinterleaved",
                                "discreterelaxedphylipinterleaved",
                                "dnaaln",
                                "rnaaln",
                                "aaaln",
                                "phyliptree",
                                "relaxedphyliptree",
                                "nexml",
                                };

/// Number of entries in gFormatNames. Derived from the array itself so that
/// the count cannot get out of step when a format name is added or removed.
const unsigned gNumFormats = sizeof(gFormatNames) / sizeof(gFormatNames[0]);

/// Number of characters that readPhylipName() consumes for a taxon name when
/// relaxed names are not allowed; also the maximum label length accepted by
/// readPhylipTreeFile() in that mode.
const unsigned PHYLIP_NMLNGTH = 10;
00090
00091 std::vector<std::string> MultiFormatReader::getFormatNames()
00092 {
00093 std::vector<std::string> v(gNumFormats);
00094 for (unsigned i = 0; i < gNumFormats; ++i)
00095 {
00096 v[i] = std::string(gFormatNames[i]);
00097 }
00098 return v;
00099 }
00100
00101
00102
/**
 * Forward-only, chunked view of the characters of a std::istream.
 *
 * The constructor reads the first chunk into `buffer`; advance() steps one
 * character at a time and calls refillBuffer() when the current chunk has been
 * consumed. While advancing, the object keeps a line counter and the position
 * of the most recent newline so that callers can report line/column numbers.
 *
 * `buffer` is NULL (and `inbuffer` is 0) when nothing could be read from the
 * stream; current() and prev() must not be called in that state.
 *
 * The object owns `buffer`, so copying is disabled (a copy would delete the
 * same array twice).
 */
class FileToCharBuffer
{
        char prevChar;              // last character of the previous chunk; returned by prev() when pos == 0
        std::istream & inf;         // stream that chunks are read from
        unsigned remaining;         // number of bytes of the stream that have not been read into buffer yet
        unsigned pos;               // index of the current character within buffer
    public:
        unsigned totalSize;         // number of bytes between the stream's initial position and its end
    protected:
        unsigned lineNumber;        // 1-based line counter maintained by advance()
        unsigned prevNewlinePos;    // value of position() at the most recent '\r' or '\n'
    public:
        /// Measures the unread part of `instream` and loads the first chunk.
        FileToCharBuffer(std::istream & instream);

        /// Loads the next chunk. Returns false if the whole stream has already
        /// been read. advance() always passes an offset of 0.
        bool refillBuffer(unsigned offset);

        /// The character at the current position (requires a non-NULL buffer).
        char current() const
        {
            return buffer[pos];
        }

        /// Moves to the next character, refilling the buffer if necessary.
        /// Returns false (leaving the position unchanged) when no more input
        /// is available.
        bool advance()
        {
            if (pos + 1 >= inbuffer)
            {
                if (!refillBuffer(0))
                    return false;
            }
            else
                ++pos;
            const char c = current();
            if (c == 13)
            {
                ++lineNumber;
                prevNewlinePos = position();
            }
            else if (c == 10)
            {
                // The LF of a CR LF pair was already counted when the CR was seen.
                if (prev() != 13)
                    ++lineNumber;
                prevNewlinePos = position();
            }
            return true;
        }

        /// advance(), then copy the new current character into `c`.
        /// `c` is left untouched when advance() fails.
        bool advance_then_store(char & c)
        {
            if (!this->advance())
                return false;
            c = this->current();
            return true;
        }

        /// Consumes the rest of the current line; see the definition.
        bool skip_to_beginning_of_line(char & next);

        /// The character before the current one (the last character of the
        /// previous chunk when the current character is the first of a chunk).
        char prev() const
        {
            if (pos == 0)
                return prevChar;
            return buffer[pos - 1];
        }

        ~FileToCharBuffer()
        {
            delete [] buffer;
        }

        /// Offset of the current character: (bytes read so far) - (size of the
        /// current chunk) + (index within the chunk).
        unsigned position() const
        {
            return totalSize + pos - remaining - inbuffer;
        }

        unsigned line() const
        {
            return lineNumber;
        }

        /// Distance from the most recent newline character (0 if that
        /// newline lies after the current position).
        unsigned column() const
        {
            unsigned p = position();
            if (p < prevNewlinePos)
                return 0;
            return p - prevNewlinePos;
        }

        char * buffer;              // current chunk (owned); NULL if nothing was read
        unsigned inbuffer;          // number of bytes of the current chunk

    private:
        // Not copyable: declared but intentionally never defined.
        FileToCharBuffer(const FileToCharBuffer &);
        FileToCharBuffer & operator=(const FileToCharBuffer &);
};
00191
00192
00193 void MultiFormatReader::ReadFilepath(const char * filepath, const char * formatName)
00194 {
00195 if (!formatName)
00196 return;
00197 DataFormatType f = formatNameToCode(formatName);
00198 if (f == UNSUPPORTED_FORMAT)
00199 {
00200 NxsString m;
00201 m << "Unsupported format: " << formatName;
00202 throw NxsException(m);
00203 }
00204 this->ReadFilepath(filepath, f);
00205 }
00206
00207 void MultiFormatReader::ReadStream(std::istream & inf, const char * formatName)
00208 {
00209 if (!formatName)
00210 return;
00211 DataFormatType f = formatNameToCode(formatName);
00212 if (f == UNSUPPORTED_FORMAT)
00213 {
00214 NxsString m;
00215 m << "Unsupported format: " << formatName;
00216 throw NxsException(m);
00217 }
00218 this->ReadStream(inf, f);
00219 }
00220
00221 FileToCharBuffer::FileToCharBuffer(std::istream & instream)
00222 :prevChar('\n'),
00223 inf(instream),
00224 pos(0),
00225 totalSize(0),
00226 lineNumber(1),
00227 prevNewlinePos(0),
00228 buffer(0L)
00229 {
00230 std::streampos s = inf.tellg();
00231 inf.seekg (0, std::ios::end);
00232 std::streampos e = inf.tellg();
00233 if (e <= s)
00234 {
00235 inbuffer = 0;
00236 remaining = 0;
00237 return;
00238 }
00239 inf.seekg(s);
00240 totalSize = static_cast<unsigned>(e - s);
00241 inbuffer = std::min(MAX_BUFFER_SIZE, totalSize);
00242 remaining = totalSize - inbuffer;
00243 buffer = new char [inbuffer];
00244 inf.read(buffer, inbuffer);
00245 const char c = current();
00246 if (c == 13)
00247 {
00248 ++lineNumber;
00249 prevNewlinePos = position();
00250 }
00251 else if (c == 10)
00252 {
00253 if (prev() != 13)
00254 ++lineNumber;
00255 prevNewlinePos = position();
00256 }
00257 }
00258
00259 bool FileToCharBuffer::refillBuffer(unsigned offset)
00260 {
00261 if (remaining == 0)
00262 return false;
00263 if (offset == 0)
00264 prevChar = buffer[inbuffer-1];
00265 inbuffer = std::min(inbuffer - offset, remaining);
00266 remaining -= inbuffer;
00267 inf.read(buffer + offset, inbuffer);
00268 pos = offset;
00269 return true;
00270 }
00271
00272
00273 MultiFormatReader::DataFormatType MultiFormatReader::formatNameToCode(const std::string &s)
00274 {
00275 std::string l(s);
00276 NxsString::to_lower(l);
00277 int ind = NxsString::index_in_array(l, gFormatNames, gNumFormats);
00278 if (ind < 0)
00279 return UNSUPPORTED_FORMAT;
00280 NCL_ASSERT(ind < UNSUPPORTED_FORMAT);
00281 return MultiFormatReader::DataFormatType(ind);
00282 }
00283
00284
00285
00286
00287
00288
00289
00290 bool MultiFormatReader::readFastaSequences(
00291 FileToCharBuffer & ftcb,
00292 const NxsDiscreteDatatypeMapper &dm,
00293 std::list<std::string> & taxaNames,
00294 std::list<NxsDiscreteStateRow> & matList,
00295 size_t & longest)
00296 {
00297 char * contents = ftcb.buffer;
00298 NCL_ASSERT(contents);
00299 NxsString err;
00300 for (;;)
00301 {
00302 if (ftcb.current() == '>' && ( ftcb.prev() == '\n' || ftcb.prev() == '\r'))
00303 {
00304 std::string n;
00305 if (!ftcb.advance())
00306 break;
00307 for (;;)
00308 {
00309 char c = ftcb.current();
00310 if (c == '\n' || c == '\r')
00311 break;
00312 n.append(1, c);
00313 if (!ftcb.advance())
00314 break;
00315 }
00316 std::string nameStripped = NxsString::strip_surrounding_whitespace(n);
00317 taxaNames.push_back(nameStripped);
00318
00319 matList.push_back(NxsDiscreteStateRow());
00320 if (!ftcb.advance())
00321 break;
00322 NxsDiscreteStateRow & row = *(matList.rbegin());
00323 row.reserve(longest);
00324 for (;;)
00325 {
00326 char c = ftcb.current();
00327 if (c == '>' && (ftcb.prev() == '\n' || ftcb.prev() == '\r'))
00328 break;
00329 if (isgraph(c))
00330 {
00331 int stateCode = dm.GetStateCodeStored(c);
00332 if (stateCode == NXS_INVALID_STATE_CODE)
00333 {
00334 err << "Illegal state code \"" << c << "\" found when reading character " << row.size() << " for taxon " << n;
00335 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00336 }
00337 row.push_back(stateCode);
00338 }
00339 if (!ftcb.advance())
00340 break;
00341 }
00342 longest = std::max(longest, row.size());
00343 }
00344 else if (!ftcb.advance())
00345 break;
00346 }
00347
00348 std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00349 bool allSameLength = true;
00350 for (; sIt != matList.end(); ++sIt)
00351 {
00352 NxsDiscreteStateRow & row = *sIt;
00353 if (row.size() < longest)
00354 {
00355 allSameLength = false;
00356 break;
00357 }
00358 }
00359 return allSameLength;
00360 }
00361
00362 std::string MultiFormatReader::readPhylipName(FileToCharBuffer & ftcb, unsigned i, bool relaxedNames)
00363 {
00364 NxsString err;
00365 std::string n;
00366 if (relaxedNames)
00367 {
00368 do {
00369 n.append(1,ftcb.current());
00370 if (!ftcb.advance())
00371 {
00372 err << "End of file found when reading the name of taxon " << i+1 << ", \"" << n << "\"";
00373 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00374 }
00375 }
00376 while (isgraph(ftcb.current()));
00377 while (!isgraph(ftcb.current()))
00378 {
00379 if (!ftcb.advance())
00380 {
00381 err << "End of file found when expecting the beginning of the data for taxon " << i+1 << ", \"" << n << "\"";
00382 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00383 }
00384 }
00385 }
00386 else
00387 {
00388 std::string ws;
00389 for (unsigned letter = 0; letter < PHYLIP_NMLNGTH; ++letter)
00390 {
00391 char c = ftcb.current();
00392 if (isgraph(c))
00393 {
00394 n.append(ws);
00395 n.append(1,c);
00396 ws.clear();
00397 }
00398 else
00399 ws.append(1, c);
00400 if (!ftcb.advance())
00401 {
00402 err << "End of file found when reading the name for taxon " << i+1 << ", \"" << n << "\"";
00403 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00404 }
00405 }
00406 }
00407 return n;
00408 }
00409
00410 void MultiFormatReader::readPhylipData(
00411 FileToCharBuffer & ftcb,
00412 const NxsDiscreteDatatypeMapper &dm,
00413 std::list<std::string> & taxaNames,
00414 std::list<NxsDiscreteStateRow> & matList,
00415 const unsigned n_taxa,
00416 const unsigned n_char,
00417 bool relaxedNames)
00418 {
00419 NCL_ASSERT(n_taxa > 0 && n_char > 0);
00420 NxsString err;
00421 matList.clear();
00422 matList.assign(n_taxa, NxsDiscreteStateRow(n_char, NXS_INVALID_STATE_CODE));
00423 std::list<NxsDiscreteStateRow>::iterator mIt = matList.begin();
00424 while (!isgraph(ftcb.current()))
00425 {
00426 if (!ftcb.advance())
00427 goto funcExit;
00428 }
00429
00430 for (unsigned i = 0; i < n_taxa; ++i)
00431 {
00432 std::string n = readPhylipName(ftcb, i, relaxedNames);
00433 taxaNames.push_back(n);
00434 NCL_ASSERT(mIt != matList.end());
00435 NxsDiscreteStateRow & row = *mIt++;
00436 for (unsigned j = 0; j < n_char; ++j)
00437 {
00438 bool readChar = false;
00439 for (;;)
00440 {
00441 const char c = ftcb.current();
00442 if (isgraph(c))
00443 {
00444 if (isdigit(c))
00445 {
00446 err << "Number encountered (and ignored) within sequence for taxon " << n;
00447 NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00448 err.clear();
00449 }
00450 else
00451 {
00452 const int stateCode = dm.GetStateCodeStored(c);
00453 if (stateCode == NXS_INVALID_STATE_CODE)
00454 {
00455 if (c == '.')
00456 {
00457 if (i == 0)
00458 {
00459 err << "Illegal match character state code \".\" found in the first taxon for character " << j + 1 ;
00460 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00461 }
00462 NxsDiscreteStateRow & firstRow = *(matList.begin());
00463 row[j] = firstRow.at(j);
00464 }
00465 else
00466 {
00467 err << "Illegal state code \"" << c << "\" found when reading site " << j + 1 << " for taxon " << n;
00468 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00469 }
00470 }
00471 else
00472 row[j] = stateCode;
00473 readChar = true;
00474 }
00475 }
00476 if (!ftcb.advance())
00477 goto funcExit;
00478 if (readChar)
00479 break;
00480 }
00481 }
00482 char f = ftcb.current();
00483 while (f != '\r' && f != '\n')
00484 {
00485 if (isgraph(f))
00486 {
00487 err << "Sequence longer than " << n_char << " found for taxon " << n << ". The character \""<< f << "\" was found, and will be ignored. If the file position of this error corresponds to sequences for the next taxon in the matrix, then that is an indication that the sequences for taxon " << n << " are too short.";
00488 NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00489 err.clear();
00490 }
00491 if (!ftcb.advance())
00492 goto funcExit;
00493 f = ftcb.current();
00494 }
00495 while (!isgraph(ftcb.current()))
00496 {
00497 if (!ftcb.advance())
00498 goto funcExit;
00499 }
00500 }
00501 funcExit:
00502 if (matList.size() != n_taxa)
00503 {
00504 err << "Unexpected end of file.\nExpecting data for " << n_taxa << " taxa, but only found data for " << matList.size();
00505 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00506 }
00507 const NxsDiscreteStateRow & lastRow = *matList.rbegin();
00508 if (lastRow.size() != n_char)
00509 {
00510 err << "Unexpected end of file.\nExpecting " << n_char << " characters for taxon " << *(taxaNames.rbegin()) << ", but only found " << lastRow.size() << " characters.";
00511 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00512 }
00513 }
00514
00515
00516 void MultiFormatReader::readInterleavedPhylipData(
00517 FileToCharBuffer & ftcb,
00518 const NxsDiscreteDatatypeMapper &dm,
00519 std::list<std::string> & taxaNames,
00520 std::list<NxsDiscreteStateRow> & matList,
00521 const unsigned n_taxa,
00522 const unsigned n_char,
00523 bool relaxedNames)
00524 {
00525 NCL_ASSERT(n_taxa > 0 && n_char > 0);
00526 NxsString err;
00527 matList.clear();
00528 matList.assign(n_taxa, NxsDiscreteStateRow(n_char, NXS_INVALID_STATE_CODE));
00529 std::list<NxsDiscreteStateRow>::iterator mIt = matList.begin();
00530 unsigned startCharIndex = 0;
00531 unsigned endCharIndex = n_char;
00532 while (!isgraph(ftcb.current()))
00533 {
00534 if (!ftcb.advance())
00535 goto funcExit;
00536 }
00537 while (startCharIndex < n_char)
00538 {
00539 for (unsigned i = 0; i < n_taxa; ++i)
00540 {
00541 if (startCharIndex == 0)
00542 {
00543 std::string n = readPhylipName(ftcb, i, relaxedNames);
00544 taxaNames.push_back(n);
00545 }
00546 if (i == 0)
00547 mIt = matList.begin();
00548 NCL_ASSERT(mIt != matList.end());
00549 NxsDiscreteStateRow & row = *mIt++;
00550 unsigned j = startCharIndex;
00551 for (;;)
00552 {
00553 const char c = ftcb.current();
00554 if (isgraph(c))
00555 {
00556 if (j >= endCharIndex)
00557 {
00558 if (i == 0)
00559 {
00560 err << "Too many characters were found for the taxon " << *(taxaNames.begin());
00561 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00562 }
00563 else
00564 {
00565 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00566 for (unsigned q = 0; q < i ; ++q)
00567 ++nIt;
00568 err << "Illegal character \"" << c << "\" found, after all of the data for this interleave page has been read for the taxon " << *nIt;
00569 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00570 }
00571 }
00572 if (isdigit(c))
00573 {
00574 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00575 for (unsigned q = 0; q < i ; ++q)
00576 ++nIt;
00577 err << "Number encountered (and ignored) within sequence for taxon " << *nIt;
00578 NexusWarn(err, NxsReader::PROBABLY_INCORRECT_CONTENT_WARNING, ftcb.position(), ftcb.line(), ftcb.column());
00579 err.clear();
00580 }
00581 else
00582 {
00583 const int stateCode = dm.GetStateCodeStored(c);
00584 if (stateCode == NXS_INVALID_STATE_CODE)
00585 {
00586 if (c == '.')
00587 {
00588 if (i == 0)
00589 {
00590 err << "Illegal match character state code \".\" found in the first taxon for character " << j + 1 ;
00591 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00592 }
00593 NxsDiscreteStateRow & firstRow = *(matList.begin());
00594 row[j] = firstRow.at(j);
00595 }
00596 else
00597 {
00598 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00599 for (unsigned q = 0; q < i ; ++q)
00600 ++nIt;
00601 err << "Illegal state code \"" << c << "\" found when reading site " << j + 1 << " for taxon " << *nIt;
00602 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00603 }
00604 }
00605 else
00606 row[j] = stateCode;
00607 j++;
00608 }
00609 }
00610 else if (c == '\r' || c == '\n')
00611 {
00612 if (i == 0)
00613 endCharIndex = j;
00614 else if (j != endCharIndex)
00615 {
00616 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00617 for (unsigned q = 0; q < i ; ++q)
00618 ++nIt;
00619 err << "Expecting " << endCharIndex - startCharIndex << "characters in this interleave page (based on the number of characters in the first taxon), but only found " << j - startCharIndex << " for taxon " << *nIt;
00620 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00621 }
00622 break;
00623 }
00624 if (!ftcb.advance())
00625 goto funcExit;
00626 }
00627 while (!isgraph(ftcb.current()))
00628 {
00629 if (!ftcb.advance())
00630 goto funcExit;
00631 }
00632 }
00633 startCharIndex = endCharIndex;
00634 endCharIndex = n_char;
00635 }
00636 funcExit:
00637 if (matList.size() != n_taxa)
00638 {
00639 err << "Unexpected end of file.\nExpecting data for " << n_taxa << " taxa, but only found data for " << matList.size();
00640 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00641 }
00642 const NxsDiscreteStateRow & lastRow = *matList.rbegin();
00643 if (lastRow.size() != n_char)
00644 {
00645 err << "Unexpected end of file.\nExpecting " << n_char << " characters for taxon " << *(taxaNames.rbegin()) << ", but only found " << lastRow.size() << " characters.";
00646 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00647 }
00648 }
00649
00650 bool FileToCharBuffer::skip_to_beginning_of_line(char & next)
00651 {
00652 next = this->current();
00653 for (;;)
00654 {
00655 const char c = next;
00656 if (!this->advance_then_store(next))
00657 return false;
00658 if (c == '\n')
00659 return true;
00660 if (c == '\r')
00661 {
00662 if (next == '\n' && (!this->advance_then_store(next)))
00663 return false;
00664 return true;
00665 }
00666 }
00667 }
00668
00669 bool MultiFormatReader::readAlnData(
00670 FileToCharBuffer & ftcb,
00671 const NxsDiscreteDatatypeMapper &dm,
00672 std::list<std::string> & taxaNames,
00673 std::list<NxsDiscreteStateRow> & matList)
00674 {
00675 taxaNames.clear();
00676 char * contents = ftcb.buffer;
00677 NCL_ASSERT(contents);
00678 NxsString err;
00679 char c;
00680 if (!ftcb.current())
00681 throw NxsException("Could not read from file", ftcb.position(), ftcb.line(), ftcb.column());
00682
00683 c = ftcb.current();
00684 unsigned index = 0;
00685 const char * firstWord = "CLUSTAL";
00686 std::string found;
00687 const unsigned lenFirstWord = strlen(firstWord);
00688 while (index < lenFirstWord)
00689 {
00690 found.append(1, c);
00691 if (toupper(c) != firstWord[index] || !ftcb.advance())
00692 {
00693 err << "Expecting file to start \"CLUSTAL\" found \"" << found << "\"";
00694 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00695 }
00696 ++index;
00697 c = ftcb.current();
00698 }
00699 do {
00700 if (!ftcb.skip_to_beginning_of_line(c))
00701 throw NxsException("Expecting multi-line file",ftcb.position(), ftcb.line(), ftcb.column());
00702 } while (!isgraph(c));
00703 bool readingFirstBlock = true;
00704 for (;;)
00705 {
00706
00707 while (!isgraph(c))
00708 {
00709 if (!ftcb.skip_to_beginning_of_line(c))
00710 {
00711 if (taxaNames.empty())
00712 throw NxsException("Sequences after clustal header", ftcb.position(), ftcb.line(), ftcb.column());
00713 goto funcExit;
00714 }
00715 }
00716 unsigned curr_tax_ind = 0;
00717 std::list<std::string>::const_iterator taxNameIt;
00718 std::list<NxsDiscreteStateRow>::iterator matRowIt;
00719 if (!readingFirstBlock)
00720 {
00721 taxNameIt = taxaNames.begin();
00722 matRowIt = matList.begin();
00723 }
00724 NxsDiscreteStateRow * row = NULL;
00725
00726 for (;isgraph(c);)
00727 {
00728 std::string n;
00729 for (;;)
00730 {
00731 n.append(1, c);
00732 if (!ftcb.advance())
00733 break;
00734 c = ftcb.current();
00735 if (!isgraph(c))
00736 break;
00737 }
00738 if (readingFirstBlock)
00739 {
00740 taxaNames.push_back(n);
00741 matList.push_back(NxsDiscreteStateRow());
00742 row = &(*(matList.rbegin()));
00743 }
00744 else if (curr_tax_ind > taxaNames.size())
00745 {
00746 err << "Expecting a line beginning with whitespace (or a blank line), but found \"" << n << "\"";
00747 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00748 }
00749 else
00750 {
00751 std::string prev_name = *taxNameIt++;
00752 if (!NxsString::case_insensitive_equals(prev_name.c_str(), n.c_str()))
00753 {
00754 err << "Expecting data for taxon # " << (1 + curr_tax_ind) << " \"" << prev_name << "\" but got \"" << n << "\"";
00755 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00756 }
00757 row = &(*matRowIt++);
00758 }
00759
00760
00761 while (ftcb.advance_then_store(c))
00762 {
00763 if (isgraph(c))
00764 break;
00765 }
00766 if (!isgraph(c))
00767 {
00768 err << "Unexpected end-of-file after taxon name \"" << n << "\"";
00769 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00770 }
00771
00772 bool eof = false;
00773 bool eoseq = false;
00774 for (;!eoseq;)
00775 {
00776 if (isgraph(c))
00777 {
00778 if (isdigit(c))
00779 {
00780 if (!ftcb.skip_to_beginning_of_line(c))
00781 {
00782 if (!readingFirstBlock && (curr_tax_ind + 1) != taxaNames.size())
00783 {
00784 err << "Unexpected End of file. Expecting data for " << taxaNames.size() << " sequences";
00785 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00786 }
00787 goto funcExit;
00788 }
00789 break;
00790 }
00791 else
00792 {
00793 int stateCode = dm.GetStateCodeStored(c);
00794 if (stateCode == NXS_INVALID_STATE_CODE)
00795 {
00796 err << "Illegal state code \"" << c << "\" found when reading character " << row->size() << " for taxon " << n;
00797 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00798 }
00799 row->push_back(stateCode);
00800 eof = !ftcb.advance_then_store(c);
00801 }
00802 }
00803 if ((!eof) && (!isgraph(c)))
00804 {
00805 if (c == '\n')
00806 {
00807 eof = !ftcb.advance_then_store(c);
00808 eoseq = true;
00809 }
00810 else if (c == '\r')
00811 {
00812 eof = !ftcb.advance_then_store(c);
00813 if (!eof && c == '\n')
00814 eof = !ftcb.advance_then_store(c);
00815 eoseq = true;
00816 }
00817 else
00818 eof = !ftcb.advance_then_store(c);
00819 }
00820 if (eof)
00821 {
00822 if (!readingFirstBlock && (curr_tax_ind + 1) != taxaNames.size())
00823 {
00824 err << "Unexpected End of file. Expecting data for " << taxaNames.size() << " sequences";
00825 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00826 }
00827 goto funcExit;
00828 }
00829 }
00830 if (isgraph(c))
00831 curr_tax_ind++;
00832 else
00833 {
00834 if (!readingFirstBlock && (1 + curr_tax_ind) != taxaNames.size())
00835 {
00836 err << "Unexpected line beginning with whitespace. Expecting data for " << taxaNames.size() << " sequences";
00837 throw NxsException(err, ftcb.position(), ftcb.line(), ftcb.column());
00838 }
00839 curr_tax_ind = 0;
00840 readingFirstBlock = false;
00841 }
00842 }
00843 }
00844
00845 funcExit:
00846
00847 std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00848 long longest = -1;
00849 for (; sIt != matList.end(); ++sIt)
00850 {
00851 NxsDiscreteStateRow & row = *sIt;
00852 if (longest == -1)
00853 longest = (long) row.size();
00854 else if (longest != (long) row.size())
00855 return false;
00856 }
00857 return true;
00858 }
00859
00860 void MultiFormatReader::addTaxaNames(const std::list<std::string> & taxaNames, NxsTaxaBlockAPI * taxa)
00861 {
00862 NCL_ASSERT(taxa);
00863 std::list<std::string>::const_iterator nIt = taxaNames.begin();
00864
00865 std::vector<NxsNameToNameTrans> nameTrans;
00866 bool nameTransNeeded = false;
00867 NxsString t;
00868
00869 for (; nIt != taxaNames.end(); ++nIt)
00870 {
00871 std::string name = *nIt;
00872 NxsNameToNameTrans trans(name, name);
00873 for (unsigned i = 1; ; ++i)
00874 {
00875 try {
00876 taxa->AddTaxonLabel(name);
00877 break;
00878 }
00879 catch (DuplicatedLabelNxsException & x)
00880 {
00881 if (!this->conversionOutputRecord.addNumbersToDisambiguateNames)
00882 throw;
00883 nameTransNeeded = true;
00884 t.assign(*nIt);
00885 t << i;
00886 trans.second = t;
00887 name = t;
00888 }
00889 }
00890 if (this->conversionOutputRecord.addNumbersToDisambiguateNames)
00891 nameTrans.push_back(trans);
00892 }
00893
00894
00895
00896 if (nameTransNeeded)
00897 this->conversionOutputRecord.writeNameTranslation(nameTrans, taxa);
00898 }
00899
00900 void MultiFormatReader::moveDataToMatrix(std::list<NxsDiscreteStateRow> & matList, NxsDiscreteStateMatrix &mat)
00901 {
00902 mat.clear();
00903 mat.resize(matList.size());
00904 NxsDiscreteStateMatrix::iterator dIt = mat.begin();
00905 std::list<NxsDiscreteStateRow>::iterator sIt = matList.begin();
00906 for (; sIt != matList.end(); ++sIt, ++dIt)
00907 {
00908 NxsDiscreteStateRow & source = *sIt;
00909 NxsDiscreteStateRow & dest = *dIt;
00910 dest.swap(source);
00911 }
00912 }
00913
00914 void MultiFormatReader::moveDataToDataBlock(const std::list<std::string> & taxaNames, std::list<NxsDiscreteStateRow> & matList, const unsigned nchar, NxsDataBlock * dataB)
00915 {
00916 NCL_ASSERT(dataB);
00917 NxsString d;
00918 d << "Dimensions ntax = " << matList.size() << " nchar = " << nchar << " ; ";
00919 std::istringstream fakeDimStream(d);
00920 NxsToken fakeDimToken(fakeDimStream);
00921 NxsString newTaxLabel("NewTaxa");
00922 NxsString ntaxLabel("NTax");
00923 NxsString ncharLabel("NChar");
00924 dataB->HandleDimensions(fakeDimToken, newTaxLabel, ntaxLabel, ncharLabel);
00925
00926 NCL_ASSERT(dataB->taxa);
00927 addTaxaNames(taxaNames, dataB->taxa);
00928
00929 moveDataToMatrix(matList, dataB->discreteMatrix);
00930 }
00931
00932 void MultiFormatReader::moveDataToUnalignedBlock(const std::list<std::string> & taxaNames, std::list<NxsDiscreteStateRow> & matList, NxsUnalignedBlock * uB)
00933 {
00934 NCL_ASSERT(uB);
00935 NxsString d;
00936 d << "Dimensions NewTaxa ntax = " << matList.size() << " ; ";
00937 std::istringstream fakeDimStream(d);
00938 NxsToken fakeDimToken(fakeDimStream);
00939 uB->HandleDimensions(fakeDimToken);
00940
00941 NCL_ASSERT(uB->taxa);
00942 addTaxaNames(taxaNames, uB->taxa);
00943
00944 moveDataToMatrix(matList, uB->uMatrix);
00945 }
00946
00947 void MultiFormatReader::readFastaFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt)
00948 {
00949 NxsString blockID("DATA");
00950 NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
00951 NCL_ASSERT(nb);
00952 if (!nb)
00953 return;
00954 nb->SetNexus(this);
00955
00956 NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb);
00957 FileToCharBuffer ftcb(inf);
00958 if (ftcb.buffer)
00959 {
00960 dataB->Reset();
00961 dataB->datatype = dt;
00962 dataB->ResetSymbols();
00963 dataB->gap = '-';
00964 NxsPartition dtParts;
00965 std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
00966 dataB->CreateDatatypeMapperObjects(dtParts, dtv);
00967
00968 const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
00969
00970 std::list<std::string> taxaNames;
00971 std::list<NxsDiscreteStateRow> matList;
00972 size_t longest = 0;
00973 bool aligned = true;
00974 try {
00975 aligned = readFastaSequences(ftcb, *dm, taxaNames, matList, longest);
00976 }
00977 catch (...)
00978 {
00979 cloneFactory.BlockError(dataB);
00980 throw;
00981 }
00982
00983 if (aligned)
00984 {
00985 moveDataToDataBlock(taxaNames, matList, longest, dataB);
00986 BlockReadHook(blockID, dataB);
00987 }
00988 else
00989 {
00990 cloneFactory.BlockError(dataB);
00991 blockID.assign("UNALIGNED");
00992 NxsBlock * nub = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
00993 if (!nub)
00994 {
00995 NCL_ASSERT(nub);
00996 return;
00997 }
00998 nub->SetNexus(this);
00999
01000 NxsUnalignedBlock * unalignedB = static_cast<NxsUnalignedBlock *>(nub);
01001 unalignedB->Reset();
01002 unalignedB->datatype = dt;
01003 unalignedB->ResetSymbols();
01004 unalignedB->ResetDatatypeMapper();
01005 moveDataToUnalignedBlock(taxaNames, matList, unalignedB);
01006 BlockReadHook(blockID, unalignedB);
01007 }
01008 }
01009 else
01010 {
01011 cloneFactory.BlockError(dataB);
01012 NxsString err;
01013 err << "No Data read -- file appears to be empty";
01014 this->NexusError(err, 0, -1, -1);
01015 }
01016 }
01017
01018 void MultiFormatReader::ReadFilepath(const char * filepath, DataFormatType format)
01019 {
01020 if (format == NEXUS_FORMAT)
01021 {
01022 NxsReader::ReadFilepath(filepath);
01023 }
01024 else
01025 {
01026 std::ifstream inf;
01027 try{
01028 inf.open(filepath, std::ios::binary);
01029 if (!inf.good())
01030 {
01031 NxsString err;
01032 err << "Could not open the file \"" << filepath <<"\"";
01033 this->NexusError(err, 0, -1, -1);
01034 }
01035 }
01036 catch (...)
01037 {
01038 NxsString err;
01039 err << '\"' << filepath <<"\" does not refer to a valid file." ;
01040 this->NexusError(err, 0, -1, -1);
01041 }
01042 this->ReadStream(inf, format, filepath);
01043 }
01044 }
01045
01046 void MultiFormatReader::ReadStream(std::istream & inf, DataFormatType format, const char * filepath)
01047 {
01048 if (format == NEXUS_FORMAT)
01049 {
01050 NxsReader::ReadFilestream(inf);
01051 }
01052 else
01053 {
01054 if (format == FASTA_DNA_FORMAT)
01055 readFastaFile(inf, NxsCharactersBlock::dna);
01056 else if (format == FASTA_RNA_FORMAT)
01057 readFastaFile(inf, NxsCharactersBlock::rna);
01058 else if (format == FASTA_AA_FORMAT)
01059 readFastaFile(inf, NxsCharactersBlock::protein);
01060 else if (format == PHYLIP_DNA_FORMAT)
01061 readPhylipFile(inf, NxsCharactersBlock::dna, false, false);
01062 else if (format == PHYLIP_RNA_FORMAT)
01063 readPhylipFile(inf, NxsCharactersBlock::rna, false, false);
01064 else if (format == PHYLIP_AA_FORMAT)
01065 readPhylipFile(inf, NxsCharactersBlock::protein, false, false);
01066 else if (format == PHYLIP_DISC_FORMAT)
01067 readPhylipFile(inf, NxsCharactersBlock::standard, false, false);
01068 else if (format == INTERLEAVED_PHYLIP_DNA_FORMAT)
01069 readPhylipFile(inf, NxsCharactersBlock::dna, false, true);
01070 else if (format == INTERLEAVED_PHYLIP_RNA_FORMAT)
01071 readPhylipFile(inf, NxsCharactersBlock::rna, false, true);
01072 else if (format == INTERLEAVED_PHYLIP_AA_FORMAT)
01073 readPhylipFile(inf, NxsCharactersBlock::protein, false, true);
01074 else if (format == INTERLEAVED_PHYLIP_DISC_FORMAT)
01075 readPhylipFile(inf, NxsCharactersBlock::standard, false, true);
01076 else if (format == RELAXED_PHYLIP_DNA_FORMAT)
01077 readPhylipFile(inf, NxsCharactersBlock::dna, true, false);
01078 else if (format == RELAXED_PHYLIP_RNA_FORMAT)
01079 readPhylipFile(inf, NxsCharactersBlock::rna, true, false);
01080 else if (format == RELAXED_PHYLIP_AA_FORMAT)
01081 readPhylipFile(inf, NxsCharactersBlock::protein, true, false);
01082 else if (format == RELAXED_PHYLIP_DISC_FORMAT)
01083 readPhylipFile(inf, NxsCharactersBlock::standard, true, false);
01084 else if (format == INTERLEAVED_RELAXED_PHYLIP_DNA_FORMAT)
01085 readPhylipFile(inf, NxsCharactersBlock::dna, true, true);
01086 else if (format == INTERLEAVED_RELAXED_PHYLIP_RNA_FORMAT)
01087 readPhylipFile(inf, NxsCharactersBlock::rna, true, true);
01088 else if (format == INTERLEAVED_RELAXED_PHYLIP_AA_FORMAT)
01089 readPhylipFile(inf, NxsCharactersBlock::protein, true, true);
01090 else if (format == INTERLEAVED_RELAXED_PHYLIP_DISC_FORMAT)
01091 readPhylipFile(inf, NxsCharactersBlock::standard, true, true);
01092 else if (format == ALN_DNA_FORMAT)
01093 readAlnFile(inf, NxsCharactersBlock::dna);
01094 else if (format == ALN_RNA_FORMAT)
01095 readAlnFile(inf, NxsCharactersBlock::rna);
01096 else if (format == ALN_AA_FORMAT)
01097 readAlnFile(inf, NxsCharactersBlock::protein);
01098 else if (format == RELAXED_PHYLIP_TREE_FORMAT)
01099 readPhylipTreeFile(inf, true);
01100 else if (format == PHYLIP_TREE_FORMAT)
01101 readPhylipTreeFile(inf, false);
01102 else
01103 {
01104 NxsString m;
01105 if (filepath)
01106 m << "The file " << filepath << " is not in a supported format.";
01107 else
01108 m << "Unsupported format.";
01109 NexusError(m, 0, -1, -1);
01110 return;
01111 }
01112 PostExecuteHook();
01113 }
01114 }
01115
01116
01117
01118
01119 unsigned MultiFormatReader::readPhylipHeader(std::istream & inf, unsigned & ntax, unsigned & nchar)
01120 {
01121 if (inf.good())
01122 inf >> ntax;
01123 if (inf.good())
01124 inf >> nchar;
01125 if (!inf.good() || ntax == 0 || nchar == 0)
01126 {
01127 NxsString err("Expecting the file to start with the number of taxa then the number of characters.");
01128 throw NxsException(err, 0, -1, -1);
01129 }
01130 return (unsigned) inf.tellg();
01131 }
01132
01133 void MultiFormatReader::readPhylipTreeFile(std::istream & inf, bool relaxedNames)
01134 {
01135 NxsString blockID("TREES");
01136 NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01137 NCL_ASSERT(nb);
01138 if (!nb)
01139 return;
01140 nb->SetNexus(this);
01141
01142
01143
01144
01145
01146 NxsTreesBlock * treesB = static_cast<NxsTreesBlock *>(nb);
01147 NxsString err;
01148 try {
01149 treesB->Reset();
01150 NxsToken inTokens(inf);
01151 treesB->ReadPhylipTreeFile(inTokens);
01152 if (!relaxedNames)
01153 {
01154 const NxsTaxaBlockAPI * taxa = treesB->GetTaxaBlockPtr(0L);
01155 if (!taxa)
01156 {
01157 err << "No taxa found in tree description (which probably means that no tree was found).";
01158 throw NxsException(err, inTokens);
01159 }
01160 const std::vector<std::string> l = taxa->GetAllLabels();
01161 for (std::vector<std::string>::const_iterator lIt = l.begin(); lIt != l.end(); ++lIt)
01162 {
01163 if (lIt->length() > PHYLIP_NMLNGTH)
01164 {
01165 err << "The taxon label " << *lIt << " has more than the allowed number of charcters (" << PHYLIP_NMLNGTH << ')';
01166 throw NxsException(err);
01167 }
01168 }
01169 }
01170 BlockReadHook(blockID, treesB);
01171 }
01172 catch (...)
01173 {
01174 cloneFactory.BlockError(nb);
01175 throw;
01176 }
01177 }
01178
01179
01180
01181
01182 void MultiFormatReader::readAlnFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt)
01183 {
01184 NxsString blockID("DATA");
01185 NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01186 NCL_ASSERT(nb);
01187 if (!nb)
01188 return;
01189 nb->SetNexus(this);
01190
01191
01192
01193
01194 NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb);
01195
01196 try {
01197 dataB->Reset();
01198 dataB->datatype = dt;
01199 dataB->ResetSymbols();
01200 dataB->gap = '-';
01201 NxsPartition dtParts;
01202 std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
01203 dataB->CreateDatatypeMapperObjects(dtParts, dtv);
01204
01205 const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
01206 NCL_ASSERT(dm);
01207 FileToCharBuffer ftcb(inf);
01208 if (ftcb.buffer)
01209 {
01210 std::list<std::string> taxaNames;
01211 std::list<NxsDiscreteStateRow> matList;
01212 if (!readAlnData(ftcb, *dm, taxaNames, matList))
01213 throw NxsException("Expecting the same number of characters for all sequences in the ALN file");
01214 const unsigned nchar = matList.begin()->size();
01215 moveDataToDataBlock(taxaNames, matList, nchar, dataB);
01216 BlockReadHook(blockID, dataB);
01217 }
01218 }
01219 catch (...)
01220 {
01221 cloneFactory.BlockError(nb);
01222 throw;
01223 }
01224 }
01225
01226
01227
01228
01229 void MultiFormatReader::readPhylipFile(std::istream & inf, NxsCharactersBlock::DataTypesEnum dt, bool relaxedNames, bool interleaved)
01230 {
01231 NxsString blockID("DATA");
01232 NxsBlock *nb = cloneFactory.GetBlockReaderForID(blockID, this, NULL);
01233 NCL_ASSERT(nb);
01234 if (!nb)
01235 return;
01236 nb->SetNexus(this);
01237
01238
01239
01240
01241 NxsDataBlock * dataB = static_cast<NxsDataBlock *>(nb);
01242
01243 try {
01244 dataB->Reset();
01245 dataB->datatype = dt;
01246 dataB->ResetSymbols();
01247 dataB->gap = '-';
01248 NxsPartition dtParts;
01249 std::vector<NxsCharactersBlock::DataTypesEnum> dtv;
01250 dataB->CreateDatatypeMapperObjects(dtParts, dtv);
01251
01252 const NxsDiscreteDatatypeMapper * dm = dataB->GetDatatypeMapperForChar(0);
01253 NCL_ASSERT(dm);
01254 unsigned ntax, nchar;
01255 unsigned headerLen = readPhylipHeader(inf, ntax, nchar);
01256 FileToCharBuffer ftcb(inf);
01257 ftcb.totalSize += headerLen;
01258 if (ftcb.buffer)
01259 {
01260 std::list<std::string> taxaNames;
01261 std::list<NxsDiscreteStateRow> matList;
01262 if (interleaved)
01263 readInterleavedPhylipData(ftcb, *dm, taxaNames, matList, ntax, nchar, relaxedNames);
01264 else
01265 readPhylipData(ftcb, *dm, taxaNames, matList, ntax, nchar, relaxedNames);
01266 moveDataToDataBlock(taxaNames, matList, nchar, dataB);
01267 BlockReadHook(blockID, dataB);
01268 }
01269 }
01270 catch (...)
01271 {
01272 cloneFactory.BlockError(nb);
01273 throw;
01274 }
01275 }