/* Some notes: * * For the most part, most scoring programs tend to agree on computed SP * scores. Column scores, however, are another matter. We describe the * method implemented in this program below. * * We process our alignment by replacing every letter with an integer * indicating the index of that letter in the corresponding sequence; * e.g., A,B,-,-,-,C,D becomes 1,2,-,-,-,3,4. Then, we replace each * gap with 0 to indicate no letter at that position. In our example, * we have 1,2,0,0,0,3,4. A column, then, is represented as an * ordered tuple of integers. * * In this definition, columns match exactly whenever they contain the * same letters from the same sequences. Extra or missing letters from * a column will result in that column being counted incorrect. Gaps * are ignored -- that is, we do not distinguish between a gap between * positions 2 and 3, and a gap between positions 5 and 6; they do * not constitute part of the column description. Note that we require * at least two residues be present in a reference column before it is * used. */ #include #include #include #include #include #include using namespace std; typedef vector > alignment_t; /* Function prototypes */ void Error (const string &error_msg); alignment_t LoadAlignment (const string &filename); void CheckAlignmentPair (const alignment_t &aln1, const alignment_t &aln2); set > ComputeColumns (const alignment_t &aln, bool require_caps); set > ComputePairwiseMatches (const alignment_t &aln, bool require_caps); set > ComputeIntersection (const set > &set1, const set > &set2); /* Main program */ int main (int argc, char **argv){ if (argc != 3){ cerr << "Usage: score_mfa REFERENCE_MFA TEST_MFA" << endl; exit(1); } alignment_t reference_alignment = LoadAlignment (argv[1]); alignment_t test_alignment = LoadAlignment (argv[2]); CheckAlignmentPair (reference_alignment, test_alignment); set > ref_pairwise_matches = ComputePairwiseMatches (reference_alignment, true); set > test_pairwise_matches = ComputePairwiseMatches (test_alignment, false); set > pairwise_matches_intersection = ComputeIntersection (ref_pairwise_matches, test_pairwise_matches); set > ref_columns = ComputeColumns (reference_alignment, true); set > test_columns = ComputeColumns (test_alignment, false); set > columns_intersection = ComputeIntersection (ref_columns, test_columns); cout << setiosflags (ios::fixed | ios::showpoint) << setprecision(10) << "Q=" << (ref_pairwise_matches.size() == 0 ? 1.0 : (double) pairwise_matches_intersection.size() / ref_pairwise_matches.size()) << ";Modeler=" << (test_pairwise_matches.size() == 0 ? 1.0 : (double) pairwise_matches_intersection.size() / test_pairwise_matches.size()) << ";TC=" << (ref_columns.size() == 0 ? 1.0 : (double) columns_intersection.size() / ref_columns.size()) << endl; } /* Report error */ void Error (const string &error_msg){ cerr << "ERROR: " << error_msg << endl; exit (1); } /* Load alignment from MFA file */ alignment_t LoadAlignment (const string &filename){ alignment_t ret; ifstream file (filename.c_str()); if (file.fail()) Error ("Could not open alignment file for reading."); string s; while (getline (file, s)){ if (s.length() == 0) continue; if (s[0] == '>') ret.push_back (make_pair(s.substr(1), "@")); else { for (int i = 0; i < (int) s.length(); i++) if (s[i] == '.') s[i] = '-'; ret.back().second += s; } } sort (ret.begin(), ret.end()); return ret; } /* Perform sanity checks on alignment pair */ void CheckAlignmentPair (const alignment_t &aln1, const alignment_t &aln2){ if (aln1.size() < 2) Error ("Alignment must contain at least two sequences."); if (aln2.size() < 2) Error ("Alignment must contain at least two sequences."); if (aln1.size() != aln2.size()) Error ("Alignments have different numbers of sequences."); for (int i = 0; i < (int) aln1.size(); i++){ if (aln1[i].first != aln2[i].first) Error ("Alignments contain different sequence names."); int pos1 = 1, pos2 = 1; const string &x = aln1[i].second; const string &y = aln2[i].second; while (pos1 < x.length() && pos2 < y.length()){ if (x[pos1] == '-') pos1++; else if (y[pos2] == '-') pos2++; else if (toupper (x[pos1]) == toupper (y[pos2])){ pos1++; pos2++; } else { Error ("Sequences in the alignments differ."); } } while (pos1 < x.length()) if (x[pos1++] != '-') Error ("Sequences in the alignments differ."); while (pos2 < y.length()) if (y[pos2++] != '-') Error ("Sequences in the alignments differ."); if (i > 0 && aln1[i].second.length() != aln1[i-1].second.length()) Error ("Rows have different length in alignment."); if (i > 0 && aln2[i].second.length() != aln2[i-1].second.length()) Error ("Rows have different length in alignment."); } } /* Compute set of all pairwise matches */ set > ComputePairwiseMatches (const alignment_t &aln, bool require_caps){ set > ret; vector match (4); int length = aln[0].second.length(); for (int i = 0; i < (int) aln.size()-1; i++){ for (int j = i+1; j < (int) aln.size(); j++){ match[0] = i; match[1] = j; int pos1 = 0, pos2 = 0; for (int k = 1; k < length; k++){ if (aln[i].second[k] != '-') pos1++; if (aln[j].second[k] != '-') pos2++; if (aln[i].second[k] != '-' && aln[j].second[k] != '-'){ if (require_caps && (!isupper(aln[i].second[k]) || !isupper(aln[j].second[k]))) continue; match[2] = pos1; match[3] = pos2; ret.insert (match); } } } } return ret; } /* Compute set of columns */ set > ComputeColumns (const alignment_t &aln, bool require_caps){ set > ret; int length = aln[0].second.length(); vector pos (aln.size(), 0); for (int k = 1; k < length; k++){ vector column; bool skip_column = false; int num_letters = 0; for (int i = 0; i < (int) aln.size(); i++){ if (aln[i].second[k] != '-'){ pos[i]++; if (require_caps && !isupper(aln[i].second[k])) skip_column = true; column.push_back (pos[i]); num_letters++; } else { column.push_back (-1); } } if (!skip_column && num_letters > 1) ret.insert (column); } return ret; } /* Compute intersection */ set > ComputeIntersection (const set > &set1, const set > &set2){ set > ret; set_intersection (set1.begin(), set1.end(), set2.begin(), set2.end(), insert_iterator > >(ret, ret.begin())); return ret; }