打印得分最高的两个序列的所有对齐
本文关键字:两个 对齐 打印 | 更新日期: 2023-09-27 18:14:27
序列比对是一个相当标准的问题,在生物信息学领域的DNA或蛋白质比对中得到了应用。我最近遇到了这个问题的另一个版本。
给定两个输入字符串(假设字符串仅由A,C,G,T组成),问题基本上是根据以下矩阵找到最大对齐分数——
A C G T -
A 5 -1 -2 -1 -3
C -1 5 -3 -2 -4
G -2 -3 5 -2 -2
T -1 -2 -2 5 -1
- -3 -4 -2 -1 Not Allowed
所以,如果A与-对齐,我们给对齐分数加-3;如果G与T对齐,我们给得分加-2;如果C与C对齐,我们加5。因此,对于输入字符串AGTGATG和GTTAG,最大对齐分数为14,其中一个最大得分的对齐可以表示为
AGTGATG
-GTTA-G
对齐评分计算如下:A- = -3, GG = 5, TT = 5, GT= -2, AA = 5, T-= -1, GG = 5。将它们加起来,-3+5+5-2+5-1+5 = 14,这是这对字符串的最大可能对齐分数。
我能够使用动态规划对其进行编码,并获得对齐分数矩阵,但我在打印具有最大对齐分数的两个字符串的所有可能对齐时面临问题。我试着像我们在LCS中做的那样回溯,但是没有成功。我附上我的代码。
static Dictionary<string, int> dict;
static void Main(string[] args)
{
//This has been assumed that the strings contain only A,C,G,T and -(?)..caps
Console.WriteLine("Enter first string : ");
string a = Console.ReadLine();
a = "-" + a;
Console.WriteLine("Enter second string : ");
string b = Console.ReadLine();
b = "-" + b;
int[,] SQ = new int[a.Length, b.Length];
#region Create Dictionary
dict = new Dictionary<string, int>();
dict.Add("AA", 5);
dict.Add("AC", -1);
dict.Add("AG", -2);
dict.Add("AT", -1);
dict.Add("A-", -3);
dict.Add("CA", -1);
dict.Add("CC", 5);
dict.Add("CG", -3);
dict.Add("CT", -2);
dict.Add("C-", -4);
dict.Add("GA", -2);
dict.Add("GC", -3);
dict.Add("GG", 5);
dict.Add("GT", -2);
dict.Add("G-", -2);
dict.Add("TA", -1);
dict.Add("TC", -2);
dict.Add("TG", -2);
dict.Add("TT", 5);
dict.Add("T-", -1);
dict.Add("-A", -3);
dict.Add("-C", -4);
dict.Add("-G", -2);
dict.Add("-T", -1);
dict.Add("--", 0);
#endregion Create Dictionary
for (int i = 0; i < a.Length; i++)
{
for (int j = 0; j < b.Length; j++)
{
int key = 0, key1 = 0, key2 = 0;
dict.TryGetValue(a[i].ToString() + b[j].ToString(), out key);
dict.TryGetValue("-" + b[j].ToString(), out key1);
dict.TryGetValue(a[i].ToString() + "-", out key2);
if (i == 0)
SQ[i, j] = key1;
else if (j == 0)
SQ[i, j] = key2;
else
SQ[i, j] = Math.Max(SQ[i - 1, j - 1] + key, Math.Max(SQ[i - 1, j] + key1, SQ[i, j - 1] + key2));
}
}
for (int i = 0; i < a.Length; i++)
{
for (int j = 0; j < b.Length; j++)
{
Console.Write(SQ[i, j] + " ");
}
Console.WriteLine();
}
Console.WriteLine("Alignment Score : " + SQ[a.Length - 1, b.Length - 1]);
printAllAlignmentsWithHighestAlignmentScore();
Console.Read();
}
有人能帮我实现printAllAlignmentsWithHighestAlignmentScore()函数吗?
最后,我有了工作代码来完成我想要做的事情。这个问题实际上是Needleman-Wunsch算法的一个细微变化
代码:
class Program
{
static Dictionary<string, int> dict;
static void printAllAlignments(int[,] SQ, string a, string b, int p, int q, string str1, string str2){
if (p == 0 || q == 0){
while (p == 0 && q != 0){
str1 = "-" + str1;
str2 = b[--q]+str2;
}
while (q == 0 && p != 0){
str1 = a[--p]+str1;
str2 = '-' + str2;
}
Console.WriteLine("'n"+str1+"'n"+str2+"'n");
return;
}
if (SQ[p, q] == (dict[a[p - 1] + b[q - 1].ToString()] + SQ[p - 1, q - 1]))
printAllAlignments(SQ, a, b, p - 1, q - 1, a[p-1]+str1, b[q-1]+str2);
if (SQ[p, q] == (dict[a[p - 1]+ "-"] + SQ[p - 1, q]))
printAllAlignments(SQ, a, b, p - 1, q, a[p-1]+str1, "-"+str2);
if (SQ[p, q] == (dict["-" + b[q-1]] + SQ[p, q - 1]))
printAllAlignments(SQ, a, b, p, q - 1, "-"+str1, b[q-1]+str2);
}
static void Main(string[] args)
{
//This has been assumed that the strings contain only A,C,G,T and -(?)..caps
Console.WriteLine("Enter first string : ");
string a = Console.ReadLine();
Console.WriteLine("Enter second string : ");
string b = Console.ReadLine();
int[,] SQ = new int[a.Length+1, b.Length+1];
#region Create Dictionary
dict = new Dictionary<string, int>();
dict.Add("AA", 5);
dict.Add("AC", -1);
dict.Add("AG", -2);
dict.Add("AT", -1);
dict.Add("A-", -3);
dict.Add("CA", -1);
dict.Add("CC", 5);
dict.Add("CG", -3);
dict.Add("CT", -2);
dict.Add("C-", -4);
dict.Add("GA", -2);
dict.Add("GC", -3);
dict.Add("GG", 5);
dict.Add("GT", -2);
dict.Add("G-", -2);
dict.Add("TA", -1);
dict.Add("TC", -2);
dict.Add("TG", -2);
dict.Add("TT", 5);
dict.Add("T-", -1);
dict.Add("-A", -3);
dict.Add("-C", -4);
dict.Add("-G", -2);
dict.Add("-T", -1);
dict.Add("--", 0);
#endregion Create Dictionary
SQ[0, 0] = 0;
for (int i = 1; i <= a.Length; i++)
SQ[i, 0] = dict["-" + a[i - 1].ToString()] + SQ[i - 1, 0];
for (int i = 1; i <= b.Length; i++)
SQ[0, i] = dict[b[i - 1].ToString() + "-"] + SQ[0, i - 1];
for (int i = 1; i <= a.Length; i++)
for (int j = 1; j <= b.Length; j++)
SQ[i, j] = Math.Max(SQ[i - 1, j - 1] + dict[a[i-1].ToString() + b[j-1]], Math.Max(SQ[i - 1, j] + dict[a[i-1] + "-"], SQ[i, j - 1] + dict["-" + b[j-1]]));
Console.WriteLine("Max Alignment Score : " + SQ[a.Length, b.Length]);
printAllAlignments(SQ, a, b, a.Length , b.Length,"","");
Console.Read();
}
}
有趣的问题。代码中的"动态编程"在哪里?
我不确定你在打印所有可能的对齐中寻找什么,但我下面的快速和肮脏的代码可能会有所帮助。它在两行内打印每次对齐,如下所示:
- 0
-
- -2
G
.
.
.
AGTGATG 8
-GTTTTA
AGTGATTG 14
-GTTTTAG
请注意,您提到的最大对齐组合不会显示在此输出中:
AGTGATG
-GTTA-G
这就是你所说的"打印所有可能对齐的问题"吗?
无论如何,这是我的代码(字典初始化删除):
public struct Alignment
{
public string substringA;
public string substringB;
public int key;
}
[MTAThread]
static void Main(string[] args)
{
//This has been assumed that the strings contain only A,C,G,T and -(?)..caps
Console.WriteLine("Enter first string : ");
var a = Console.ReadLine();
a = "-" + a;
Console.WriteLine("Enter second string : ");
var b = Console.ReadLine();
b = "-" + b;
Alignment[,] SQ = new Alignment[a.Length, b.Length];
#region Create Dictionary
...
#endregion Create Dictionary
for (int i = 0; i < a.Length; i++)
{
for (int j = 0; j < b.Length; j++)
{
int key = 0, key1 = 0, key2 = 0;
dict.TryGetValue(a[i].ToString() + b[j].ToString(), out key);
dict.TryGetValue("-" + b[j].ToString(), out key1);
dict.TryGetValue(a[i].ToString() + "-", out key2);
if (i == 0)
{
SQ[i, j].substringA = "-";
SQ[i, j].substringB = b[j].ToString();
SQ[i, j].key = key1;
}
else if (j == 0)
{
SQ[i, j].substringA = a[i].ToString();
SQ[i, j].substringB = "-";
SQ[i, j].key = key2;
}
else
{
// Get the maximum key value, and the substrings associated with it.
int score;
var score1 = SQ[i - 1, j].key + key1;
var score2 = SQ[i, j - 1].key + key2;
if (score1 >= score2)
{
SQ[i, j].substringA = SQ[i - 1, j].substringA;
SQ[i, j].substringB = SQ[i - 1, j].substringB;
score = score1;
}
else
{
SQ[i, j].substringA = SQ[i, j - 1].substringA;
SQ[i, j].substringB = SQ[i, j - 1].substringB;
score = score2;
}
var score3 = SQ[i - 1, j - 1].key + key;
if (score3 >= score)
{
SQ[i, j].substringA = SQ[i - 1, j - 1].substringA;
SQ[i, j].substringB = SQ[i - 1, j - 1].substringB;
score = score3;
}
SQ[i, j].substringA += a[i];
SQ[i, j].substringB += b[j];
SQ[i, j].key = score;
}
}
}
PrintAlignments(SQ, a.Length, b.Length);
Console.WriteLine("Alignment Score : " + SQ[a.Length - 1, b.Length - 1].key);
Console.Read();
}
private static void PrintAlignments(Alignment[,] SQ, int iLength, int jLength)
{
for (int i = 0; i < iLength; i++)
{
for (int j = 0; j < jLength; j++)
{
Console.WriteLine("{0} {1}", SQ[i, j].substringA, SQ[i, j].key);
Console.WriteLine("{0}", SQ[i, j].substringB);
Console.WriteLine();
}
}
}
每个状态(DP cell) X有3个前导状态;我们称它们为X1 X2 X3。我们称状态X的分数为s(X),从状态X到达邻近状态Y的成本为c(X, Y)。
对于任何给定的状态X,通常只有它的一个前体是最优的,当用它自己的得分加上从它到X的成本来衡量时。例如,可能是s(X1) + c(X1, X)> s(X2) + c(X2, X)和s(X1) + c(X1, X)> s(X3) + c(X3, X):在这种情况下,X1在X的3个前体中是唯一最优的,而回溯只需要遵循X1。
但是可能会发生2个最优相等,甚至3个最优相等的前辈,在这些情况下,遵循2或3个最优前辈中的任何一个将(通过DP公式的正确性)产生最优对齐。通常,您只对生成单一对齐感兴趣,因此在回溯期间,您只需任意选择一个最优的前身,并遵循它。但是,如果您想全部生成它们,则不需要这样做,而是需要循环遍历所有先前状态并递归处理每个状态。
只是因为我对这个问题很感兴趣,而且我对您的解决方案的方法很感兴趣,所以我很快为一些测试编写了一个蛮力解决方案。对于一些微不足道的问题,我发现我们会产生不同的解决方案。比如TA和AT。你的得分结果是1(我不知道它基于什么对齐),而我的是:
TA-
-AT
得分为3。也许我没有把你的问题理解对,如果你能检查一下,我会很高兴的。
static Dictionary<string, int> dict;
static void Main(string[] args)
{
//This has been assumed that the strings contain only A,C,G,T and -(?)..caps
Console.WriteLine("Enter first string : ");
string realInputA = Console.ReadLine();
string inputA = "-" + realInputA;
Console.WriteLine("Enter second string : ");
string realInputB = Console.ReadLine();
string inputB = "-" + realInputB;
int[,] scoreMatrix = new int[inputA.Length, inputB.Length];
#region Create Dictionary
dict = new Dictionary<string, int>();
dict.Add("AA", 5);
dict.Add("AC", -1);
dict.Add("AG", -2);
dict.Add("AT", -1);
dict.Add("A-", -3);
dict.Add("CA", -1);
dict.Add("CC", 5);
dict.Add("CG", -3);
dict.Add("CT", -2);
dict.Add("C-", -4);
dict.Add("GA", -2);
dict.Add("GC", -3);
dict.Add("GG", 5);
dict.Add("GT", -2);
dict.Add("G-", -2);
dict.Add("TA", -1);
dict.Add("TC", -2);
dict.Add("TG", -2);
dict.Add("TT", 5);
dict.Add("T-", -1);
dict.Add("-A", -3);
dict.Add("-C", -4);
dict.Add("-G", -2);
dict.Add("-T", -1);
dict.Add("--", 0);
#endregion Create Dictionary
for (int i = 0; i < inputA.Length; i++)
{
for (int j = 0; j < inputB.Length; j++)
{
int score = 0, score1 = 0, score2 = 0;
dict.TryGetValue(inputA[i].ToString() + inputB[j].ToString(), out score);
dict.TryGetValue("-" + inputB[j].ToString(), out score1);
dict.TryGetValue(inputA[i].ToString() + "-", out score2);
if (i == 0)
scoreMatrix[i, j] = score1;
else if (j == 0)
scoreMatrix[i, j] = score2;
else
scoreMatrix[i, j] = Math.Max(scoreMatrix[i - 1, j - 1] + score, Math.Max(scoreMatrix[i - 1, j] + score1, scoreMatrix[i, j - 1] + score2));
}
}
for (int i = 0; i < inputA.Length; i++)
{
for (int j = 0; j < inputB.Length; j++)
{
Console.Write(scoreMatrix[i, j] + " ");
}
Console.WriteLine();
}
Console.WriteLine("Alignment Score : " + scoreMatrix[inputA.Length - 1, inputB.Length - 1]);
printAllAlignments(realInputA, realInputB);
Console.Read();
}
static void printAllAlignments(string inputA, string inputB)
{
int minLen = Math.Max(inputA.Length, inputB.Length);
int maxLen = inputA.Replace("-", "").Length + inputB.Replace("-", "").Length;
Dictionary<string, int> solutions = new Dictionary<string, int>();
solutions = prepareStartSequences(inputA, inputB, minLen, solutions);
addLongerSequences(inputA, minLen, maxLen, solutions);
var solutionsOrdered = solutions.OrderByDescending(x => x.Value);
int maxScore = solutionsOrdered.First().Value;
foreach (var sol in solutionsOrdered.Where(x => x.Value == maxScore))
{
Console.WriteLine("{0}'n{1}'tScore: {2}'n'n", sol.Key.Split('|')[0], sol.Key.Split('|')[1], sol.Value);
}
}
private static void addLongerSequences(string inputA, int minLen, int maxLen, Dictionary<string, int> solutions)
{
for (int l = minLen + 1; l <= maxLen; l++)
{
List<Tuple<string, string>> currCombs = solutions
.Where(x => x.Key.Length / 2 + 1 == l)
.Select(x => x.Key.Split('|'))
.Select(x => new Tuple<string, string>(x[0], x[1]))
.ToList();
foreach (var comb in currCombs)
{
for (int idxA = 0; idxA <= inputA.Length; idxA++)
{
for (int idxB = 0; idxB <= inputA.Length; idxB++)
{
string cA = comb.Item1.Insert(idxA, "-");
string cB = comb.Item2.Insert(idxB, "-");
int score = getScore(cA, cB);
string key = cA + "|" + cB;
if (!solutions.ContainsKey(key) && score > int.MinValue)
{
solutions.Add(key, score);
}
}
}
}
}
}
private static Dictionary<string, int> prepareStartSequences(string inputA, string inputB, int minLen, Dictionary<string, int> solutions)
{
if (inputA.Length == inputB.Length)
solutions.Add(inputA + "|" + inputB, getScore(inputA, inputB));
else
{
string shorter = inputA.Length > inputB.Length ? inputB : inputA;
string longer = inputA.Length > inputB.Length ? inputA : inputB;
int shortLen = shorter.Length;
List<string> combinations = new List<string>();
combinations.Add(shorter);
while (shortLen < minLen)
{
List<string> tmpCombinations = new List<string>();
foreach (string str in combinations)
{
for (int i = 0; i <= shortLen; i++)
{
tmpCombinations.Add(str.Insert(i, "-"));
}
}
combinations = tmpCombinations.Distinct().ToList();
shortLen++;
}
foreach (var comb in combinations)
{
if (inputA.Length > inputB.Length)
{
solutions.Add(longer + "|" + comb, getScore(longer, comb));
}
else
{
solutions.Add(comb + "|" + longer, getScore(comb, longer));
}
}
}
solutions = solutions.Where(x => x.Value != int.MinValue).ToDictionary(x => x.Key, y => y.Value);
return solutions;
}
static int getScore(string inputA, string inputB)
{
int result = 0;
for (int i = 0; i < inputA.Length; i++)
{
string key = inputA[i].ToString() + inputB[i].ToString();
if (key == "--") return int.MinValue;
result += dict.ContainsKey(key) ? dict[key] : 0;
}
return result;
}