Split a string by spaces — preserving quoted substrings — in Python
我有一个像这样的字符串:
this is "a test"
我正在尝试用Python编写代码,按空格将其拆分,同时忽略引号内的空格。我想要的结果是:
['this', 'is', 'a test']
PS:我知道您会问:"如果引号内还有引号怎么办?"——在我的应用程序中,这种情况永远不会发生。
您需要的是shlex模块中的split函数。
1 2 3
| >>> import shlex
>>> shlex.split('this is "a test"')
['this', 'is', 'a test'] |
这应该正是您想要的。
看一下shlex模块,尤其是shlex.split。
1 2 3
| >>> import shlex
>>> shlex.split('this is "a test"')
['this', 'is', 'a test'] |
我在这里看到正则表达式方法看起来很复杂和/或错误。这让我感到惊讶,因为正则表达式语法可以轻松地描述"空格或引号引起的东西",并且大多数正则表达式引擎(包括Python的)都可以在正则表达式上拆分。因此,如果您要使用正则表达式,为什么不直接说出您的意思呢?:
1 2 3 4
import re

# Sample input; a single-quoted phrase such as "this is 'a test'" works too.
test = 'this is "a test"'
# Split on a single space or on a complete quoted run.  The capture group
# makes re.split() keep the separators, so quoted runs survive as list items;
# the strip() filter then drops the bare-space and empty-string pieces.
# The non-greedy .*? stops at the nearest closing quote, so several quoted
# runs on one line stay separate (a greedy .* would merge them into one).
pieces = [p for p in re.split(r"""( |".*?"|'.*?')""", test) if p.strip()]
说明:
1 2 3 4
| [\\"'] = double-quote or single-quote
.* = anything
( |X) = space or X
.strip() = remove space and empty-string separators |
shlex可能提供更多功能。
根据您的用例,您可能还需要检出csv模块:
1 2 3 4
import csv

# Each input line is parsed as one CSV record with a single space as the
# delimiter; the csv module's default double-quote handling keeps a quoted
# phrase together as one field and drops the quotes.
lines = ['this is "a string"', 'and more "stuff"']
for row in csv.reader(lines, delimiter=" "):
    print(row)
输出:
1 2
| ['this', 'is', 'a string']
['and', 'more', 'stuff'] |
我使用shlex.split来处理70,000,000行的Squid日志,速度非常慢,所以我换成了re。
如果shlex有性能问题,请尝试此操作。
1 2 3 4
import re

# Compiled once at import time: this helper is meant for very large inputs,
# so the pattern lookup should not be repeated for every line.
_LINE_TOKEN_RE = re.compile(r'[^"\s]\S*|".+?"')


def line_split(line):
    """Split *line* into whitespace-separated tokens, keeping quoted phrases whole.

    A token is either a run of non-whitespace characters that does not start
    with a double quote, or the shortest non-empty double-quoted span.  The
    surrounding double quotes are kept in the returned tokens.

    Returns a list of strings; an empty or all-whitespace line gives ``[]``.
    """
    return _LINE_TOKEN_RE.findall(line)
由于此问题带有正则表达式标签,因此我决定尝试使用正则表达式方法。我首先将引号部分中的所有空格替换为\x00,然后按空格分割,最后再把每个部分中的\x00替换回空格。
两种版本都做同样的事情,但是splitter比splitter2更具可读性。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
| import re
s = 'this is"a test" some text"another test"'
def splitter(s):
    """Split *s* on whitespace while keeping double-quoted phrases intact.

    Spaces inside each double-quoted span are temporarily replaced with NUL
    characters so that ``str.split()`` cannot see them, then restored in every
    resulting token.  The surrounding quotes are kept in the output.

    Returns a list of strings.
    """
    def replacer(m):
        # Hide the spaces of one quoted span behind a placeholder character.
        return m.group(0).replace(" ", "\x00")

    parts = re.sub('".+?"', replacer, s).split()
    # Turn the placeholders back into the spaces they stand for.
    return [p.replace("\x00", " ") for p in parts]
def splitter2(s):
    """Compact variant of the NUL-placeholder split.

    Spaces inside double-quoted spans are swapped for NUL characters, the
    string is split on whitespace, and the NULs are swapped back to spaces.
    The surrounding quotes are kept in the output.  Returns a list of strings.
    """
    hidden = re.sub('".+?"', lambda m: m.group(0).replace(" ", "\x00"), s)
    return [p.replace("\x00", " ") for p in hidden.split()]
print splitter2(s) |
出于性能原因,似乎re更快。 这是我使用非贪婪(最小匹配)运算符并保留外层引号的解决方案:
1
# Each token is a run of quoted spans and/or non-whitespace characters, so the
# outer quotes are kept; the non-greedy .*? ends a quoted span at the nearest
# closing double quote.
re.findall(r'(?:".*?"|\S)+', s)
结果:
1
| ['this', 'is', '"a test"'] |
它将aaa"bla blub"bbb之类的构造保留在一起,因为这些标记之间没有空格。 如果字符串包含转义字符,则可以这样进行匹配:
1 2 3 4 5 6 7 8
| >>> a ="She said "He said, \\"My name is Mark.\\"""
>>> a
'She said"He said, \"My name is Mark.\""'
>>> for i in re.findall("(?:".*?[^\\\\]"|\S)+", a): print(i)
...
She
said
"He said, "My name is Mark."" |
请注意,这也通过模式的\S部分匹配空字符串""。
接受的shlex方法的主要问题是它不会忽略带引号的子字符串之外的转义字符,并且在某些特殊情况下会产生一些意外的结果。
我有以下用例,在这里我需要一个拆分函数,该函数拆分输入字符串,以便保留单引号或双引号的子字符串,并能够在这样的子字符串中转义引号。无引号的字符串中的引号不应与其他任何字符区别对待。带有预期输出的一些示例测试用例:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
| input string | expected output
===============================================
'abc def' | ['abc', 'def']
"abc \\s def" | ['abc', '\\s', 'def']
'"abc def" ghi' | ['abc def', 'ghi']
"'abc def' ghi" | ['abc def', 'ghi']
'"abc \" def" ghi' | ['abc" def', 'ghi']
"'abc \' def' ghi" | ["abc ' def", 'ghi']
"'abc \\s def' ghi" | ['abc \\s def', 'ghi']
'"abc \\s def" ghi' | ['abc \\s def', 'ghi']
'"" test' | ['', 'test']
"'' test" | ['', 'test']
"abc'def" | ["abc'def"]
"abc'def'" | ["abc'def'"]
"abc'def' ghi" | ["abc'def'", 'ghi']
"abc'def'ghi" | ["abc'def'ghi"]
'abc"def' | ['abc"def']
'abc"def"' | ['abc"def"']
'abc"def" ghi' | ['abc"def"', 'ghi']
'abc"def"ghi' | ['abc"def"ghi']
"r'AA' r'.*_xyz$'" | ["r'AA'","r'.*_xyz$'"] |
我最终得到了以下函数来拆分字符串,以便所有输入字符串的预期输出结果:
1 2 3 4 5 6 7 8 9
import re

# Alternatives are tried in this order at each position: a double-quoted run,
# a single-quoted run (either may contain backslash-escaped characters), or a
# plain run of non-whitespace characters.
_QUOTED_TOKEN_RE = re.compile(r'"(?:\\.|[^"])*"|\'(?:\\.|[^\'])*\'|[^\s]+')


def quoted_split(s):
    """Split *s* on whitespace, treating quoted substrings as single tokens.

    A token that starts and ends with the same quote character (single or
    double) has that pair of quotes removed.  Backslash-escaped quotes
    (``\\"`` and ``\\'``) in a token are replaced by the bare quote character.
    Quote characters inside an otherwise unquoted token are left untouched.

    Returns a list of strings.
    """
    def strip_quotes(token):
        # Require two characters so that a lone quote character is not
        # mistaken for an empty quoted string.
        if len(token) >= 2 and token[0] in '"\'' and token[0] == token[-1]:
            return token[1:-1]
        return token

    return [strip_quotes(p).replace('\\"', '"').replace("\\'", "'")
            for p in _QUOTED_TOKEN_RE.findall(s)]
以下测试应用程序检查其他方法(现在为shlex和csv)和自定义拆分实现的结果:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
| #!/bin/python2.7
import csv
import re
import shlex
from timeit import timeit
def test_case(fn, s, expected):
try:
if fn(s) == expected:
print '[ OK ] %s -> %s' % (s, fn(s))
else:
print '[FAIL] %s -> %s' % (s, fn(s))
except Exception as e:
print '[FAIL] %s -> exception: %s' % (s, e)
def test_case_no_output(fn, s, expected):
try:
fn(s)
except:
pass
def test_split(fn, test_case_fn=test_case):
    """Feed every sample input to *fn* through *test_case_fn*.

    fn: the split function under test.
    test_case_fn: callable invoked as ``test_case_fn(fn, input, expected)``
        once per sample, in the order listed below.
    """
    # (input string, expected token list).  The two escaped-quote inputs carry
    # a real backslash (written \\ in the literal) so that they exercise
    # backslash-escaped quotes inside a quoted substring.
    cases = [
        ('abc def', ['abc', 'def']),
        ("abc \\s def", ['abc', '\\s', 'def']),
        ('"abc def" ghi', ['abc def', 'ghi']),
        ("'abc def' ghi", ['abc def', 'ghi']),
        ('"abc \\" def" ghi', ['abc " def', 'ghi']),
        ("'abc \\' def' ghi", ["abc ' def", 'ghi']),
        ("'abc \\s def' ghi", ['abc \\s def', 'ghi']),
        ('"abc \\s def" ghi', ['abc \\s def', 'ghi']),
        ('"" test', ['', 'test']),
        ("'' test", ['', 'test']),
        ("abc'def", ["abc'def"]),
        ("abc'def'", ["abc'def'"]),
        ("abc'def' ghi", ["abc'def'", 'ghi']),
        ("abc'def'ghi", ["abc'def'ghi"]),
        ('abc"def', ['abc"def']),
        ('abc"def"', ['abc"def"']),
        ('abc"def" ghi', ['abc"def"', 'ghi']),
        ('abc"def"ghi', ['abc"def"ghi']),
        ("r'AA' r'.*_xyz$'", ["r'AA'", "r'.*_xyz$'"]),
    ]
    for s, expected in cases:
        test_case_fn(fn, s, expected)
def csv_split(s):
    """Split *s* by parsing it as one space-delimited CSV record.

    Quoting follows the csv module's defaults, so only double quotes group
    text into a single field.  Returns the fields of the first record as a
    list of strings.
    """
    rows = list(csv.reader([s], delimiter=' '))
    return rows[0]
# Alternatives are tried in this order at each position: a double-quoted run,
# a single-quoted run (either may contain backslash-escaped characters), or a
# plain run of non-whitespace characters.  Compiled once so repeated calls
# reuse the same pattern object.
_RE_SPLIT_TOKEN = re.compile(r'"(?:\\.|[^"])*"|\'(?:\\.|[^\'])*\'|[^\s]+')


def re_split(s):
    """Split *s* on whitespace, treating quoted substrings as single tokens.

    A token that starts and ends with the same quote character (single or
    double) has that pair of quotes removed.  Backslash-escaped quotes
    (``\\"`` and ``\\'``) in a token are replaced by the bare quote character.
    Quote characters inside an otherwise unquoted token are left untouched.

    Returns a list of strings.
    """
    def strip_quotes(token):
        # Require two characters so that a lone quote character is not
        # mistaken for an empty quoted string.
        if len(token) >= 2 and token[0] in '"\'' and token[0] == token[-1]:
            return token[1:-1]
        return token

    return [strip_quotes(p).replace('\\"', '"').replace("\\'", "'")
            for p in _RE_SPLIT_TOKEN.findall(s)]
if __name__ == '__main__':
    # Every print call passes a single string, which behaves the same under
    # Python 2 and Python 3.
    print('shlex\n')
    test_split(shlex.split)
    print('')

    print('csv\n')
    test_split(csv_split)
    print('')

    print('re\n')
    test_split(re_split)
    print('')

    iterations = 100
    # timeit runs its statement in a fresh namespace, so the names used by the
    # benchmarked snippets have to be imported in the setup code.
    setup = ('from __main__ import test_split, test_case_no_output, csv_split, re_split\n'
             'import shlex, re')

    def benchmark(method, code):
        """Print the mean time in milliseconds of one execution of *code*."""
        total_seconds = timeit(code, setup=setup, number=iterations)
        print('%s: %.3fms per iteration' % (method, 1000 * total_seconds / iterations))

    benchmark('shlex', 'test_split(shlex.split, test_case_no_output)')
    benchmark('csv', 'test_split(csv_split, test_case_no_output)')
    benchmark('re', 'test_split(re_split, test_case_no_output)')
输出:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
| shlex
[ OK ] abc def -> ['abc', 'def']
[FAIL] abc \s def -> ['abc', 's', 'def']
[ OK ]"abc def" ghi -> ['abc def', 'ghi']
[ OK ] 'abc def' ghi -> ['abc def', 'ghi']
[ OK ]"abc " def" ghi -> ['abc" def', 'ghi']
[FAIL] 'abc \' def' ghi -> exception: No closing quotation
[ OK ] 'abc \s def' ghi -> ['abc \\s def', 'ghi']
[ OK ]"abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ]"" test -> ['', 'test']
[ OK ] '' test -> ['', 'test']
[FAIL] abc'def -> exception: No closing quotation
[FAIL] abc'def' -> ['abcdef']
[FAIL] abc'def' ghi -> ['abcdef', 'ghi']
[FAIL] abc'def'ghi -> ['abcdefghi']
[FAIL] abc"def -> exception: No closing quotation
[FAIL] abc"def" -> ['abcdef']
[FAIL] abc"def" ghi -> ['abcdef', 'ghi']
[FAIL] abc"def"ghi -> ['abcdefghi']
[FAIL] r'AA' r'.*_xyz$' -> ['rAA', 'r.*_xyz$']
csv
[ OK ] abc def -> ['abc', 'def']
[ OK ] abc \s def -> ['abc', '\\s', 'def']
[ OK ]"abc def" ghi -> ['abc def', 'ghi']
[FAIL] 'abc def' ghi -> ["'abc","def'", 'ghi']
[FAIL]"abc " def" ghi -> ['abc \', 'def"', 'ghi']
[FAIL] 'abc \' def' ghi -> ["'abc","\'","def'", 'ghi']
[FAIL] 'abc \s def' ghi -> ["'abc", '\\s',"def'", 'ghi']
[ OK ]"abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ]"" test -> ['', 'test']
[FAIL] '' test -> ["''", 'test']
[ OK ] abc'def -> ["abc'def"]
[ OK ] abc'def' -> ["abc'def'"]
[ OK ] abc'def' ghi -> ["abc'def'", 'ghi']
[ OK ] abc'def'ghi -> ["abc'def'ghi"]
[ OK ] abc"def -> ['abc"def']
[ OK ] abc"def" -> ['abc"def"']
[ OK ] abc"def" ghi -> ['abc"def"', 'ghi']
[ OK ] abc"def"ghi -> ['abc"def"ghi']
[ OK ] r'AA' r'.*_xyz$' -> ["r'AA'","r'.*_xyz$'"]
re
[ OK ] abc def -> ['abc', 'def']
[ OK ] abc \s def -> ['abc', '\\s', 'def']
[ OK ]"abc def" ghi -> ['abc def', 'ghi']
[ OK ] 'abc def' ghi -> ['abc def', 'ghi']
[ OK ]"abc " def" ghi -> ['abc" def', 'ghi']
[ OK ] 'abc \' def' ghi -> ["abc ' def", 'ghi']
[ OK ] 'abc \s def' ghi -> ['abc \\s def', 'ghi']
[ OK ]"abc \s def" ghi -> ['abc \\s def', 'ghi']
[ OK ]"" test -> ['', 'test']
[ OK ] '' test -> ['', 'test']
[ OK ] abc'def -> ["abc'def"]
[ OK ] abc'def' -> ["abc'def'"]
[ OK ] abc'def' ghi -> ["abc'def'", 'ghi']
[ OK ] abc'def'ghi -> ["abc'def'ghi"]
[ OK ] abc"def -> ['abc"def']
[ OK ] abc"def" -> ['abc"def"']
[ OK ] abc"def" ghi -> ['abc"def"', 'ghi']
[ OK ] abc"def"ghi -> ['abc"def"ghi']
[ OK ] r'AA' r'.*_xyz$' -> ["r'AA'","r'.*_xyz$'"]
shlex: 0.281ms per iteration
csv: 0.030ms per iteration
re: 0.049ms per iteration |
因此,性能要比shlex好得多,并且可以通过预编译正则表达式来进一步提高性能,在这种情况下,它的性能将优于csv方法。
要保留引号,请使用以下功能:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
def getArgs(s):
    """Split *s* on spaces, keeping double-quoted sections and their quotes.

    Leading and trailing whitespace is stripped first.  A space separates
    arguments only while outside double quotes; every double quote toggles the
    in-quotes state and is kept in the output.  Runs of several spaces, and an
    empty input, produce no empty arguments.

    Returns a list of strings.
    """
    args = []
    current = []  # characters of the argument being built; joined once at the end
    in_quotes = False
    for char in s.strip():
        if char == ' ' and not in_quotes:
            # Separator: flush the pending argument, if there is one.
            if current:
                args.append(''.join(current))
                current = []
            continue
        if char == '"':
            in_quotes = not in_quotes
        current.append(char)
    if current:
        args.append(''.join(current))
    return args
为了解决某些Python 2版本中的unicode问题,我建议:
1 2
| from shlex import split as _split
split = lambda a: [b.decode('utf-8') for b in _split(a.encode('utf-8'))] |
速度测试的不同答案:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
| import re
import shlex
import csv
line = 'this is"a test"'
%timeit [p for p in re.split("( |\\".*?\\"|'.*?')", line) if p.strip()]
100000 loops, best of 3: 5.17 μs per loop
%timeit re.findall(r'[^"\s]\S*|".+?"', line)
100000 loops, best of 3: 2.88 μs per loop
%timeit list(csv.reader([line], delimiter=""))
The slowest run took 9.62 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.4 μs per loop
%timeit shlex.split(line)
10000 loops, best of 3: 50.2 μs per loop |
上面讨论的shlex的unicode问题(最佳答案)似乎在2.7.2+中已(间接)解决了
http://bugs.python.org/issue6988#msg146200
(因为我无法发表评论,所以单独回答)
嗯,似乎无法找到" Reply"按钮……无论如何,此答案基于Kate的方法,但是正确地将字符串与包含转义引号的子字符串分开,并且还删除了子字符串的开始和结束引号:
1
| [i.strip('"').strip("'") for i in re.split(r'(\s+|(?<!\\)".*?(?<!\\)"|(?<!\\)\'.*?(?<!\\)\')', string) if i.strip()] |
这适用于'This is" a \\\"test\\\"\\\'s substring"'之类的字符串(不幸的是,必须使用疯狂的标记来防止Python删除转义符)。
如果不需要返回列表中的字符串中的结果转义符,则可以使用此函数的稍有改动的版本:
1
| [i.strip('"').strip("'").decode('string_escape') for i in re.split(r'(\s+|(?<!\\)".*?(?<!\\)"|(?<!\\)\'.*?(?<!\\)\')', string) if i.strip()] |
我建议:
测试字符串:
1
| s = 'abc"ad" \'fg\'"kk\'rdt\'" zzz"34"zzz"" \'\'' |
同时捕获空的""和'':
1 2
| import re
re.findall(r'"[^"]*"|\'[^\']*\'|[^"\'\s]+',s) |
结果:
1
| ['abc', '"ad"',"'fg'", '"kk\'rdt\'"', 'zzz', '"34"', 'zzz', '""',"''"] |
忽略空的""和'':
1 2
| import re
re.findall(r'"[^"]+"|\'[^\']+\'|[^"\'\s]+',s) |
结果:
1
| ['abc', '"ad"',"'fg'", '"kk\'rdt\'"', 'zzz', '"34"', 'zzz'] |
如果您不关心带引号的子字符串,那么可以直接使用简单的字符串方法:
1
| >>> 'a short sized string with spaces '.split() |
性能:
1 2 3 4
| >>> s =" ('a short sized string with spaces '*100).split()"
>>> t = timeit.Timer(stmt=s)
>>> print"%.2f usec/pass" % (1000000 * t.timeit(number=100000)/100000)
171.39 usec/pass |
或字符串模块
1 2
| >>> from string import split as stringsplit;
>>> stringsplit('a short sized string with spaces '*100) |
性能:字符串模块似乎比字符串方法的性能更好
1 2 3 4
| >>> s ="stringsplit('a short sized string with spaces '*100)"
>>> t = timeit.Timer(s,"from string import split as stringsplit")
>>> print"%.2f usec/pass" % (1000000 * t.timeit(number=100000)/100000)
154.88 usec/pass |
或者您可以使用RE引擎
1 2 3 4
| >>> from re import split as resplit
>>> regex = '\s+'
>>> medstring = 'a short sized string with spaces '*100
>>> resplit(regex, medstring) |
性能
1 2 3 4
| >>> s ="resplit(regex, medstring)"
>>> t = timeit.Timer(s,"from re import split as resplit; regex='\s+'; medstring='a short sized string with spaces '*100")
>>> print"%.2f usec/pass" % (1000000 * t.timeit(number=100000)/100000)
540.21 usec/pass |
对于非常长的字符串,您不应将整个字符串加载到内存中,而应拆分行或使用迭代循环
尝试这个:
1 2 3 4 5 6 7 8 9 10
def adamsplit(s):
    """Split *s* on whitespace, keeping double-quoted sections as single items.

    The string is cut at every double quote.  Pieces at even positions lie
    outside quotes and are split on whitespace; pieces at odd positions lie
    inside quotes and are kept whole, without the quote characters.

    Returns a list of strings.
    """
    result = []
    for index, chunk in enumerate(s.split('"')):
        if index % 2:
            # Odd position: text that sat between a pair of double quotes.
            result.append(chunk)
        else:
            result.extend(chunk.split())
    return result
一些测试字符串:
1 2
| 'This is"a test"' -> ['This', 'is', 'a test']
'"This is \'a test\'"' -> ["This is 'a test'"] |
|