在 C# 中从 CP1251 转换为 Unicode
本文关键字:Unicode 转换 CP1251 中从 | 更新日期: 2023-09-27 18:35:48
我有包含俄语文本的大数据
"Ìèíàñÿí Ðóäèê Ñàðêèñîâ"
我需要在 C# 中将其转换为 Unicode
"Минасян Рудик Саркисов"
如何转换它?
如果它真的是 1251,那么代码是:
// Garbled input: presumably CP1251 bytes that were decoded with the wrong
// (Latin-1-like) encoding somewhere upstream.
string str = "Ìèíàñÿí Ðóäèê Ñàðêèñîâ";

// Step 1: re-encode with ISO-8859-1 to get a byte array back from the text.
Encoding latin1 = Encoding.GetEncoding("iso-8859-1");
byte[] rawBytes = latin1.GetBytes(str);

// Step 2: decode those bytes with code page 1251.
Encoding cyrillic = Encoding.GetEncoding(1251);
string str2 = cyrillic.GetString(rawBytes);
Encoding.GetEncoding("iso-8859-1").GetBytes(str) 返回"原始"(未经处理的)byte[] 数组——ISO-8859-1 会把 U+0000 到 U+00FF 的每个字符映射为数值相同的单个字节,因此可以还原出原始字节——
然后我再用 CP1251 对这些字节
进行解码。
我在这里附上一个用来"解决"此类问题的小程序。请注意,这是一个 .NET Framework 程序,而不是 .NET Core 程序,因为 .NET Core 中的 Encoding.GetEncodings 有问题(默认只返回内置的少数几种编码)。
该程序会搜索可用于修复文本编码错误问题的编码:它先列出能够对给定的完整文本进行编码/解码的候选编码,然后尝试将它们两两匹配。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
/// <summary>
/// Diagnostic tool for wrongly decoded ("mojibake") text. It encodes a garbled
/// string and the expected correct string with every encoding returned by
/// <see cref="Encoding.GetEncodings"/>, and reports each pair of encodings that
/// produces the same byte sequence for both strings. For such a pair, encoding
/// the garbled text with the first and decoding with the second yields the
/// expected text.
/// </summary>
class Program
{
    // Replacement emitted for characters an encoding cannot represent. It must be
    // the NUL character so that the "contains a 0 byte" check below can detect
    // (and reject) encodings that cannot encode the complete text.
    private const string UnmappableMarker = "\0";

    /// <summary>
    /// Prints the hex dump of both sample strings in every usable encoding,
    /// followed by the list of "garbled encoding -> real encoding" candidates.
    /// </summary>
    /// <param name="args">Command-line arguments (not used).</param>
    static void Main(string[] args)
    {
        Console.OutputEncoding = System.Text.Encoding.UTF8;

        var encodings = Encoding.GetEncodings();
        Console.WriteLine($"Testing {encodings.Length} encodings");
        Console.WriteLine();

        string from = "Ìèíàñÿí Ðóäèê Ñàðêèñîâ"; // "Єюээ";
        string to = "Минасян Рудик Саркисов"; // "тонн";

        Console.WriteLine(from);
        var encodingsFrom = EncodeWithAll(from, encodings);
        Console.WriteLine(string.Empty);

        Console.WriteLine(to);
        var encodingsTo = EncodeWithAll(to, encodings);
        Console.WriteLine(string.Empty);

        Console.WriteLine("Candidates:");

        // Two encodings are a candidate pair when they turn the garbled text and
        // the expected text into exactly the same bytes.
        foreach (var encodingFrom in encodingsFrom)
        {
            foreach (var encodingTo in encodingsTo.Where(x => x.Hex == encodingFrom.Hex))
            {
                Console.WriteLine($"{encodingFrom.Encoding.HeaderName} -> {encodingTo.Encoding.HeaderName}");
            }
        }
    }

    // Encodes text with every single-byte or UTF encoding, prints each hex dump,
    // and returns the (hex, encoding) pairs that could represent the whole text.
    private static List<(string Hex, Encoding Encoding)> EncodeWithAll(string text, EncodingInfo[] encodings)
    {
        var results = new List<(string Hex, Encoding Encoding)>();

        foreach (var info in encodings)
        {
            var enc = info.GetEncoding();
            bool unicodeEncoding = enc.BodyName.StartsWith("utf-", StringComparison.Ordinal);

            if (!enc.IsSingleByte && !unicodeEncoding)
            {
                Console.WriteLine($"Skipped {enc.BodyName}");
                continue;
            }

            // Encoding instances obtained from the framework are read-only;
            // a clone is needed before the fallback can be changed.
            enc = (Encoding)enc.Clone();

            // We replace unknown characters with easy-to-find code 0.
            // Note that this is useless for encodings that map all the characters
            // and use the 0 for something else (like utf-16 and utf-32).
            enc.EncoderFallback = new EncoderReplacementFallback(UnmappableMarker);

            var bytes = enc.GetBytes(text);

            // A 0 byte means at least one character was not representable,
            // so this encoding cannot be the one we are looking for.
            if (!unicodeEncoding && (bytes.Length == 0 || bytes.Any(x => x == 0)))
            {
                continue;
            }

            // Write in hex format
            string encodedHex = string.Join(" ", bytes.Select(x => x.ToString("x2")));
            Console.WriteLine($"{encodedHex} {enc.HeaderName}");

            results.Add((encodedHex, enc));
        }

        return results;
    }
}