字符串按长度拆分,仅按最近的空格拆分
本文关键字:拆分 最近 空格 字符串 | 更新日期: 2023-09-27 18:28:54
我有一段类似下面这样的文本:
var data = "âô¢¬ôè÷¢ : ªîø¢è¤ô¢ - ã¿ñ¬ô ñèù¢ ªð¼ñ£÷¢ ï¤ôñ¢,«ñø¢è¤ô¢ - ªð¼ñ£÷¢ ñèù¢ ÝÁºèñ¢ ï¤ô袰ñ¢ ñ¤ì¢ì£ Üò¢òñ¢ ªð¼ñ£ñ¢ð좮 è¤ó£ñ âô¢¬ô袰ñ¢,õìè¢è¤ô¢ - ÝÁºèñ¢ ï¤ôñ¢,è¤öè¢è¤ô¢ - ô좲ñ¤ ï¤ôñ¢ ñø¢Áñ¢ 1,22 ªê ï¤ôñ¢ ð£î¢î¤òñ¢";
我正在使用扩展方法来拆分字符串
/// <summary>
/// Splits <paramref name="s"/> into consecutive chunks of exactly
/// <paramref name="length"/> characters; the last chunk holds whatever remains
/// and may be shorter. An empty string produces an empty sequence.
/// </summary>
/// <param name="s">The string to split.</param>
/// <param name="length">Chunk size in characters; must be positive.</param>
/// <returns>A lazily evaluated sequence of chunks, in order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="length"/> is zero or negative.</exception>
public static IEnumerable<string> EnumByLength(this string s, int length)
{
    // Validate eagerly: an iterator body does not run until it is enumerated,
    // so the checks live in this non-iterator wrapper.
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // A zero step would never advance the loop and would yield "" forever.
    if (length <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(length), "Chunk length must be positive.");
    }

    return EnumByLengthIterator(s, length);
}

// Lazily yields the fixed-size chunks; arguments are already validated by the caller.
private static IEnumerable<string> EnumByLengthIterator(string s, int length)
{
    for (int i = 0; i < s.Length; i += length)
    {
        // The final chunk is clipped to the characters that are left.
        yield return s.Substring(i, Math.Min(length, s.Length - i));
    }
}
/// <summary>
/// Splits <paramref name="s"/> into an array of consecutive chunks of
/// <paramref name="maxLen"/> characters (the last chunk may be shorter).
/// An empty string produces an empty array.
/// </summary>
/// <param name="s">The string to split.</param>
/// <param name="maxLen">Chunk size in characters; must be positive.</param>
/// <returns>The chunks, in order.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="maxLen"/> is zero or negative.</exception>
public static string[] SplitByLength(this string s, int maxLen)
{
    if (s == null)
    {
        throw new ArgumentNullException(nameof(s));
    }

    // A non-positive chunk size can never make progress through the string.
    if (maxLen <= 0)
    {
        throw new ArgumentOutOfRangeException(nameof(maxLen), "Chunk length must be positive.");
    }

    // An iterator method never returns null, so no null check on the sequence
    // is needed; build and materialize it exactly once.
    return s.EnumByLength(maxLen).ToArray();
}
现在我的问题是
若要按最大长度 150 拆分此字符串,则只能在最接近的空格处拆分(在第 150 个字符之前或之后均可),不能在单词中间拆分。
该如何实现?
我的版本:
/// <summary>
/// Splits <paramref name="value"/> at spaces only, choosing for every chunk the
/// space whose position makes the chunk length closest to <paramref name="length"/>
/// (the chunk may be shorter or longer than <paramref name="length"/>).
/// The space used as a split point is not included in any chunk.
/// </summary>
/// <remarks>
/// Example for length = 3:
/// "abcd efghihjkl m n p qrstsf" -> "abcd", "efghihjkl", "m n", "p", "qrstsf".
/// When two candidate split points are equally close, the earlier one wins.
/// </remarks>
/// <param name="value">The text to split; null or empty yields nothing.</param>
/// <param name="length">The desired chunk length.</param>
/// <returns>A lazily evaluated sequence of chunks, in order.</returns>
public static IEnumerable<String> EnumByNearestSpace(this String value, int length) {
    if (String.IsNullOrEmpty(value))
        yield break;

    int bestDelta = int.MaxValue; // distance of the best split candidate from the target length
    int bestSplit = -1;           // index of the best space seen since 'from', or -1
    int from = 0;                 // start of the chunk currently being built

    for (int i = 0; i < value.Length; ++i) {
        if (value[i] != ' ')
            continue;

        int delta = Math.Abs((i - from) - length);

        if ((bestSplit < 0) || (delta < bestDelta)) {
            // This space is a better (or the first) candidate for the current chunk.
            bestSplit = i;
            bestDelta = delta;
        }
        else {
            // Candidates are getting worse: cut at the best one and rescan
            // from just after it (the loop's ++i moves past the space).
            yield return value.Substring(from, bestSplit - from);

            i = bestSplit;
            from = i + 1;
            bestSplit = -1;
            bestDelta = int.MaxValue;
        }
    }

    // String's tail: decide between cutting at the last candidate space and
    // keeping everything from 'from' to the end as a single chunk.
    if (from < value.Length) {
        // The end of the string is itself a candidate split point.
        int tailDelta = Math.Abs((value.Length - from) - length);

        // Cut only when the pending space is at least as good as the end of
        // the string; otherwise the text before it must stay in the last chunk
        // rather than be dropped.
        if ((bestSplit >= 0) && (bestDelta <= tailDelta)) {
            yield return value.Substring(from, bestSplit - from);
            from = bestSplit + 1;
        }

        if (from < value.Length)
            yield return value.Substring(from);
    }
}
...
var list = data.EnumByNearestSpace(150).ToList();
我的版本
var data = "âô¢¬ôè÷¢ : ªîø¢è¤ô¢ - ã¿ñ¬ô ñèù¢ ªð¼ñ£÷¢ ï¤ôñ¢,«ñø¢è¤ô¢ - ªð¼ñ£÷¢ ñèù¢ ÝÁºèñ¢ ï¤ô袰ñ¢ ñ¤ì¢ì£ Üò¢òñ¢ ªð¼ñ£ñ¢ð좮 è¤ó£ñ âô¢¬ô袰ñ¢,õìè¢è¤ô¢ - ÝÁºèñ¢ ï¤ôñ¢,è¤öè¢è¤ô¢ - ô좲ñ¤ ï¤ôñ¢ ñø¢Áñ¢ 1,22 ªê ï¤ôñ¢ ð£î¢î¤òñ¢";
// Split `data` into chunks of at most maxLength characters, cutting only at
// spaces. The space used as a cut point is dropped. A single word longer than
// maxLength is kept intact (the chunk then runs to the next space or to the
// end of the text), so no word is ever broken in the middle.
int maxLength = 150;
var newList = new List<string>();
int start = 0;
while (start < data.Length)
{
    // Whatever is left fits into one chunk: take it and stop.
    if (data.Length - start <= maxLength)
    {
        newList.Add(data.Substring(start));
        break;
    }

    // Search backwards for the last space in (start, start + maxLength], so
    // the chunk is as long as possible without exceeding maxLength.
    int split = data.LastIndexOf(' ', start + maxLength, maxLength);
    if (split == -1)
    {
        // No space inside the window: extend to the first space after it.
        split = data.IndexOf(' ', start + maxLength);
        if (split == -1)
        {
            split = data.Length;
        }
    }

    newList.Add(data.Substring(start, split - start));
    start = split + 1; // skip the space that was used as the cut point
}
newList 中即包含拆分后的各个字符串。
这是个老话题,但我刚好遇到了同样的问题,并尝试自己解决。下面是我的做法:如果任何单词的长度超过当前限制,它还会报告错误。
// Demo entry point: splits a sample sentence into chunks of at most 20
// characters and prints each chunk with its length, then waits for Enter.
static void Main(string[] args)
{
    const string veryLongText = @"Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua.";

    var chunks = SplitString(veryLongText, 20);

    // SplitString signals failure with null; in that case nothing is listed.
    if (chunks != null)
    {
        foreach (var chunk in chunks)
        {
            Console.WriteLine($"{chunk.Length, 3} : '{chunk}'");
        }
    }

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
/// <summary>
/// Splits <paramref name="data"/> into lines of at most <paramref name="length"/>
/// characters, breaking only at spaces. The space used as a break point is
/// not included in either line.
/// </summary>
/// <param name="data">The text to split.</param>
/// <param name="length">Maximum number of characters per line.</param>
/// <returns>
/// The lines in order, or null (after writing an error message to the console)
/// when a single word is longer than <paramref name="length"/>.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
private static List<string> SplitString(string data, int length)
{
    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    List<string> result = new List<string>();

    // A word longer than the limit cannot be placed on any line.
    if (data.Split(' ').Any(x => x.Length > length))
    {
        Console.WriteLine("ERROR, SINGLE WORD EXCEED THE CURRENT LIMIT!");
        return null;
    }

    int newLinePos = 0;

    // While more than one line's worth of text remains, cut at the last space
    // that keeps the line within the limit. Because no word exceeds `length`,
    // the window [newLinePos, newLinePos + length] always contains a space.
    while (data.Length - newLinePos > length)
    {
        int lastSpace = data.LastIndexOf(' ', newLinePos + length, length + 1);
        result.Add(data.Substring(newLinePos, lastSpace - newLinePos));
        newLinePos = lastSpace + 1;
    }

    // The remainder fits on one line; always emit it so the tail is never lost.
    if (newLinePos < data.Length)
    {
        result.Add(data.Substring(newLinePos));
    }

    return result;
}
好了:
// Iterator-body fragment: walks `s` and yields chunks of at most `length`
// characters, cutting only at spaces (the cut space itself is dropped).
// A word longer than `length` is yielded whole rather than broken.
// Expects `s` (non-null string) and `length` (non-negative int) to be in scope.
for (int i = 0; i < s.Length; )
{
    // The rest of the text fits into one chunk.
    if (s.Length - i <= length)
    {
        yield return s.Substring(i);
        break;
    }

    // Last space in (i, i + length]: gives the longest chunk within the limit.
    int index = s.LastIndexOf(' ', i + length, length);
    if (index == -1)
    {
        // No space in the window: extend the chunk to the next space, if any.
        index = s.IndexOf(' ', i + length);
        if (index == -1)
        {
            yield return s.Substring(i);
            break;
        }
    }

    yield return s.Substring(i, index - i);
    i = index + 1; // continue after the space used as the cut point
}
我的字符串扩展方法:
/// <summary>
/// Truncates <paramref name="src"/> at the space nearest to position
/// <paramref name="pos"/> and appends "..." when anything was cut off.
/// </summary>
/// <remarks>
/// The end of the string counts as a candidate cut point, so a string whose
/// end is the nearest boundary is returned unchanged. When the space before
/// and the boundary after <paramref name="pos"/> are equally distant, the
/// earlier one is used. When no space precedes <paramref name="pos"/>, the
/// next boundary is used instead of cutting the text down to nothing.
/// </remarks>
/// <param name="src">The text to trim; null or empty is returned as is.</param>
/// <param name="pos">Target cut position; negative values are treated as 0.</param>
/// <returns>
/// <paramref name="src"/> unchanged when it is not longer than
/// <paramref name="pos"/>; otherwise the trimmed text, followed by "..." if shortened.
/// </returns>
public static string TrimAtNearestWhiteSpace(this string src, int pos)
{
    if (string.IsNullOrEmpty(src) || src.Length <= pos)
    {
        return src;
    }

    if (pos < 0)
    {
        pos = 0;
    }

    // Nearest boundary at or after pos; the end of the string is an option too.
    int nextSpace = src.IndexOf(' ', pos);
    if (nextSpace < 0)
    {
        nextSpace = src.Length;
    }

    // Nearest space strictly before pos; -1 means there is none.
    int prevSpace = pos > 0 ? src.LastIndexOf(' ', pos - 1) : -1;

    int cut;
    if (prevSpace < 0)
    {
        // Nothing to cut back to: go forward rather than returning only "...".
        cut = nextSpace;
    }
    else
    {
        cut = (nextSpace - pos < pos - prevSpace) ? nextSpace : prevSpace;
    }

    // Add the ellipsis only if the return value is actually trimmed.
    return cut < src.Length ? src.Substring(0, cut) + "..." : src;
}
用法:
var source = "Lorem ipsum dolor sit amet, consectetur adipiscing elit";
var readmore = source.TrimAtNearestWhiteSpace(6);
试试这个,这段代码会把长句子拆分成多行,每行长度不超过 chunkSize,并且不会把单词从中间截断:
/// <summary>
/// Breaks <paramref name="toSplit"/> into lines of whole words, each at most
/// <paramref name="chunkSize"/> characters long including the single spaces
/// that join the words. Words are separated by spaces or tabs in the input;
/// runs of separators are collapsed.
/// </summary>
/// <remarks>
/// A word that is longer than <paramref name="chunkSize"/> is placed alone on
/// its own line (and therefore exceeds the limit) instead of being broken.
/// </remarks>
/// <param name="toSplit">The text to split.</param>
/// <param name="chunkSize">Maximum line length in characters.</param>
/// <returns>The lines in order; empty when the input contains no words.</returns>
/// <exception cref="ArgumentNullException"><paramref name="toSplit"/> is null.</exception>
private List<string> splitIntoChunks(string toSplit, int chunkSize)
{
    if (toSplit == null)
    {
        throw new ArgumentNullException(nameof(toSplit));
    }

    List<string> splittedLines = new List<string>();
    string[] words = toSplit.Split(new char[] { ' ', '\t' }, StringSplitOptions.RemoveEmptyEntries);

    int i = 0;
    while (i < words.Length)
    {
        // Every line takes at least one word, so an over-long word cannot
        // stall the loop.
        int firstWord = i;
        int lineSize = words[i].Length;
        i++;

        // Keep adding words while the line, including the joining space,
        // still fits within chunkSize.
        while (i < words.Length && lineSize + 1 + words[i].Length <= chunkSize)
        {
            lineSize += 1 + words[i].Length;
            i++;
        }

        splittedLines.Add(string.Join(" ", words, firstWord, i - firstWord));
    }

    return splittedLines;
}