Stripping control characters in Python
Want to remove ASCII control characters from a string? Have Unicode control characters that can't be encoded into XML? Here is how to strip them with regex in Python.
def strip_control_characters(input): if input: import re # unicode invalid characters RE_XML_ILLEGAL = u'([\u0000-\u0008\u000b-\u000c\u000e-\u001f\ufffe-\uffff])' + \ u'|' + \ u'([%s-%s][^%s-%s])|([^%s-%s][%s-%s])|([%s-%s]$)|(^[%s-%s])' % \ (unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff), unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff), unichr(0xd800),unichr(0xdbff),unichr(0xdc00),unichr(0xdfff), ) input = re.sub(RE_XML_ILLEGAL, "", input) # ascii control characters input = re.sub(r"[\x01-\x1F\x7F]", "", input) return input
Unicode section adapted from Max Harper.