c#中使用空格分割字符串
本文关键字:分割 字符串 空格 | 更新日期: 2023-09-27 18:18:57
我想用空格分割字符串,但位于双引号 ("text") 或单引号 ('text') 之内的文本不应被拆开。
我用这个函数来做:
/// <summary>
/// Splits a keyword expression into tokens separated by spaces. A token that starts
/// with a double or single quote runs up to its closing quote, so spaces inside it
/// do not split it.
/// </summary>
/// <param name="keywordExpressionValue">Expression to split. Null or blank input yields an empty array.</param>
/// <param name="isUniqueKeywordReq">When true, a token that was already collected is not added again.</param>
/// <returns>The extracted tokens, with leading and trailing double quotes trimmed from each one.</returns>
public static string[] ParseKeywordExpression(string keywordExpressionValue, bool isUniqueKeywordReq)
{
    // Test for null before calling Trim(); trimming first would dereference a null argument.
    if (keywordExpressionValue == null)
    {
        return new string[0];
    }

    keywordExpressionValue = keywordExpressionValue.Trim();
    if (keywordExpressionValue.Length == 0)
    {
        return new string[0];
    }

    int idx = keywordExpressionValue.IndexOf(" ");
    if (idx == -1)
    {
        // No space at all: the whole expression is a single token (returned untrimmed of quotes).
        return new string[] { keywordExpressionValue };
    }

    int count = keywordExpressionValue.Length;
    ArrayList extractedList = new ArrayList();
    while (count > 0)
    {
        if (keywordExpressionValue[0] == '"')
        {
            // NOTE(review): BACKSLASH is declared outside this block; from its use here it is
            // presumably the double-quote delimiter despite its name -- confirm.
            int temp = keywordExpressionValue.IndexOf(BACKSLASH, 1, keywordExpressionValue.Length - 1);

            // A delimiter preceded by a backslash is escaped; keep looking for the next one.
            while (temp != -1 && keywordExpressionValue[temp - 1] == '\\')
            {
                temp = keywordExpressionValue.IndexOf(BACKSLASH, temp + 1, keywordExpressionValue.Length - temp - 1);
            }

            // Unterminated quote: take the rest of the text as one token instead of indexing at -2.
            idx = temp == -1 ? keywordExpressionValue.Length : temp + 1;
        }

        if (keywordExpressionValue[0] == '\'')
        {
            // NOTE(review): BACKSHASH_QUOTE is declared outside this block; presumably the
            // single-quote delimiter -- confirm.
            int temp = keywordExpressionValue.IndexOf(BACKSHASH_QUOTE, 1, keywordExpressionValue.Length - 1);

            // Same escape handling as for double quotes.
            while (temp != -1 && keywordExpressionValue[temp - 1] == '\\')
            {
                temp = keywordExpressionValue.IndexOf(BACKSHASH_QUOTE, temp + 1, keywordExpressionValue.Length - temp - 1);
            }

            idx = temp == -1 ? keywordExpressionValue.Length : temp + 1;
        }

        // Cut the current token off the front and continue with the trimmed remainder.
        string s = keywordExpressionValue.Substring(0, idx);
        int left = count - idx;
        keywordExpressionValue = keywordExpressionValue.Substring(idx, left).Trim();

        string token = s.Trim('"');
        if (!isUniqueKeywordReq || !extractedList.Contains(token))
        {
            extractedList.Add(token);
        }

        count = keywordExpressionValue.Length;
        idx = keywordExpressionValue.IndexOf(SPACE);
        if (idx == -1)
        {
            // No separator left: whatever remains is the final token (if it is not empty).
            string add = keywordExpressionValue.Trim('"', ' ');
            if (add.Length > 0 && (!isUniqueKeywordReq || !extractedList.Contains(add)))
            {
                extractedList.Add(add);
            }

            break;
        }
    }

    return (string[])extractedList.ToArray(typeof(string));
}
是否有其他方法可以做到这一点,或者可以优化这个函数?
例如,我希望将字符串
%ABC% %aasdf% aalasdjjfas "c:\Document and Setting\Program Files\abc.exe"
拆分为:
%ABC%
%aasdf%
aalasdjjfas
"c:\Document and Setting\Program Files\abc.exe"
最简单的正则表达式,可同时处理单引号和双引号:
("((\\")|([^"]))*")|('((\\')|([^']))*')|(\S+)
// Alternatives, tried in this order: a double-quoted token, a single-quoted token,
// or a run of non-whitespace characters. Inside a quoted token, a backslash followed
// by the quote character is consumed as part of the token where possible.
var regex = new Regex(@"(""((\\"")|([^""]))*"")|('((\\')|([^']))*')|(\S+)");
var matches = regex.Matches(inputstring);
foreach (Match match in matches)
{
    // match.Value keeps the surrounding quotes of quoted tokens.
    extractedList.Add(match.Value);
}
所以基本上四到五行代码就足够了。
表达式,解释:
Main structure:
("((\\")|([^"]))*")   Double-quoted token
|                     , or
('((\\')|([^']))*')   Single-quoted token
|                     , or
(\S+)                 Any group of non-space characters
Double-quoted token:
(          Group starts
"          Initial double-quote
(          Inner group starts
(\\")      Either a backslash followed by a double-quote
|          , or
([^"])     any non-double-quote character
)*         The inner group repeats any number of times (or zero)
"          Ending double-quote
)          Group ends
Single-quoted token:
(          Group starts
'          Initial single-quote
(          Inner group starts
(\\')      Either a backslash followed by a single-quote
|          , or
([^'])     any non-single-quote character
)*         The inner group repeats any number of times (or zero)
'          Ending single-quote
)          Group ends
Non-space characters:
(          Group starts
\S         Non-white-space character
+          , repeated at least once
)          Group ends
如果你不喜欢RegEx,这个方法应该能够分割带引号的字符串,并且忽略连续的空格:
/// <summary>
/// Splits <paramref name="input"/> on space characters, keeping single- or
/// double-quoted sections together and ignoring runs of consecutive spaces.
/// </summary>
/// <param name="input">Text to split. A null value produces no tokens.</param>
/// <returns>
/// The tokens in order of appearance. Quoted tokens keep their surrounding quotes.
/// When a closing quote leaves only two characters in the buffer (an empty pair of
/// quotes), nothing is returned for it.
/// </returns>
public IEnumerable<string> SplitString(string input)
{
    if (input == null)
    {
        yield break;
    }

    // '\0' while outside quotes; otherwise the quote character that opened the section.
    // A single state means the other kind of quote inside a quoted section is plain text
    // and cannot start a nested section that would split the token.
    var openQuote = '\0';
    var sb = new StringBuilder();
    foreach (var c in input)
    {
        if (openQuote != '\0')
        {
            sb.Append(c);
            if (c != openQuote)
            {
                continue;
            }

            // Closing quote reached; skip the token when the buffer holds just the two quotes.
            if (sb.Length > 2)
            {
                yield return sb.ToString();
            }

            sb.Clear();
            openQuote = '\0';
        }
        else if (c == '"' || c == '\'')
        {
            openQuote = c;
            sb.Append(c);
        }
        else if (c == ' ')
        {
            // Consecutive spaces leave the buffer empty; there is nothing to emit then.
            if (sb.Length == 0)
            {
                continue;
            }

            yield return sb.ToString();
            sb.Clear();
        }
        else
        {
            sb.Append(c);
        }
    }

    // Flush the trailing token, including an unterminated quoted section.
    if (sb.Length > 0)
    {
        yield return sb.ToString();
    }
}
编辑:将返回类型更改为IEnumerable,使用yield和StringBuilder
我在模式中使用十六进制转义 \x27 和 \x22 来表示单引号和双引号,这样模式的 C# 字面文本更容易阅读和处理。
还使用了 IgnorePatternWhitespace,因为它允许在模式中添加注释以提高可读性,并且不影响正则表达式的处理。
// Sample input: one single-quoted token, three bare tokens and a double-quoted path containing spaces.
string data = @"'single' %ABC% %aasdf% aalasdjjfas ""c:\Document and Setting\Program Files\abc.exe""";

// \x27 and \x22 are the hex escapes for the single and double quote; using them avoids
// doubling quote characters inside the verbatim string.
string pattern = @"(?xm)         # Tell the regex compiler we are commenting (x = IgnorePatternWhitespace)
                                 # and tell the compiler this is multiline (m),
                                 # In Multiline the ^ matches each start line and $ is each EOL
                                 # -Pattern Start-
^(                               # Start at the beginning of the line always
   (?![\r\n]|$)                  # Stop the match if EOL or EOF found.
   (?([\x27\x22])                # Regex If to check for single/double quotes
       (?:[\x27\x22])            # \\x27\\x22 are single/double quotes
       (?<Token>[^\x27\x22]+)    # Match this in the quotes and place in Named match Token
       (?:[\x27\x22])
    |                            # or (else) part of If when Not within quotes
       (?<Token>[^\s\r\n]+)      # Not within quotes, but put it in the Token match group
   )                             # End of Pattern OR
   (?:\s?)                       # Either a space or EOL/EOF
)+                               # 1 or more tokens of data.
";

// One match covers the whole line; each token is one capture of the repeated "Token" group.
Console.WriteLine(string.Join(" | ",
    Regex.Match(data, pattern)
         .Groups["Token"]
         .Captures
         .OfType<Capture>()
         .Select(cp => cp.Value)));

/* Output
single | %ABC% | %aasdf% | aalasdjjfas | c:\Document and Setting\Program Files\abc.exe
*/
以上是基于以下两篇我写的博客文章:
- 使用正则表达式将CSV数据提取到Linq和Dictionary中
- 正则表达式和If条件