| 1 | # coding=utf-8
|
|---|
| 2 |
|
|---|
| 3 | import re
|
|---|
| 4 |
|
|---|
# SGML/XML markup: either a whole comment or a single tag/directive.
# Verbose-mode pattern: whitespace and "#" comments outside character
# classes are ignored by the regex engine. DOTALL lets ".*?" span newlines
# inside a comment.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
"""
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)

# One or more consecutive whitespace characters.
WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(WHITESPACE)

# Dotted host name: one or more "label." parts followed by an alphabetic
# top-level part of at least two letters. Written in lower case; the
# patterns that embed it are compiled with IGNORECASE.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL with an explicit scheme (ftp/ftps/http/https/file) or a bare "www."
# prefix, followed by a host, an optional port and an optional path/query.
# Both halves around DNS_HOST are raw strings so that regex escapes such as
# "\." and "\w" reach the regex engine unchanged.
URL = r"""
    (
        # scheme://[user[:password]@]
        (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
        # or "www" without the scheme part
        |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
        | localhost |
        ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# E-mail address: local part, "@", then a DNS host name.
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)

# HTML character reference: numeric ("&#160;", "&#xE9;") or named ("&amp;").
# The digit class accepts hex digits in either case because the pattern is
# compiled without IGNORECASE.
HTMLENTITY = r"&(#x?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)

# Bare ".com" / ".org" host name with one or two leading labels and no
# scheme. The lookarounds keep the match from starting or ending inside a
# longer run of word characters.
DOTCOM = r"""
    (?<!\w)
    ([-a-z0-9]+\.){1,2}(com|org)
    (?!\w)
"""
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE)

# Number: optional sign, digits that may contain "," and "." separators,
# and an optional exponent. The exponent must carry an explicit sign
# ("1.5e-10"). The match must start at the beginning of the text or after
# whitespace, and must not be followed by "-" or a word character.
NUMBER = r"""
    (?<!\S)
    [-+]?
    (\d[\d,.]*\d | \d)
    ([eE][-+][0-9]+)?
    (?![-\w])
"""
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)

# Fixed list of abbreviations that keep their trailing period. The match
# must not be preceded by a word character.
# NOTE(review): compiled without IGNORECASE, so capitalised forms such as
# "Dr." are not matched by this pattern -- confirm that this is intended.
ABBREVIATION = r"""
    (?<!\w)
    (?:
        # general
        co\.|etc\.|inc\.|ltd\.|dr\.|prof\.|jr\.
    )
"""
ABBREVIATION_RE = re.compile(ABBREVIATION, re.UNICODE | re.VERBOSE)

# Dotted upper-case initialisms such as "U.S.A." or "U.S.A": one or more
# "X." groups, optionally followed by a final capital letter that is not
# itself the start of a longer word.
USA = r"""
    (?<!\w)
    ([A-Z]\.)+([A-Z](?!\w))?
"""
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)

# Standard word pattern, kept for reference only: r"\w[\w-]*\w|\w"
#
# Special for Ethiopian languages: an apostrophe is part of a word when
# there is a letter on both sides of the apostrophe and there is no digit
# on either side of it. The leading (?!\d) also keeps digits out of words
# altogether, so "abc123" yields the word "abc".
WORD = r"(?:(?!\d)(?:[\w-]|(?<!\d)(?<=\w)'(?=\w)(?!\d)))+"
WORD_RE = re.compile(WORD, re.UNICODE)

# Punctuation kept together as one match: a run of "?" / "!" characters,
# or a pair of apostrophes.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)

# Brackets and quotation marks that are always matched one character at a
# time.
#
# The literal is deliberately NOT raw: the \uXXXX / \xXX escapes are decoded
# by the Python parser, and the two ASCII square brackets are written as
# regex escapes ("\\[" and "\\]"). If "]" reached the regex engine as a bare
# character it would close the character class right after the first few
# members and turn the rest of the list into a literal sequence.
SINGLECHAR_PUNCTUATION = (
    u"["
    u"\u0028\\[\u007b\u0029\\]\u007d"
    u"\u2985\u2989\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018"
    u"\u169b\u201c\xab\u23b5\xbb\u0f3a\ufd3e\u29d9\u27e9\u276a"
    u"\u276e\u2772\u29fd\u2986\u300b\u298a\u300f\u298e\u2992\u3017"
    u"\u3018\u301b\u169c\u301f\u0f3d\u29da\u27e6\u2769\u27ea\u276d"
    u"\u2771\u2775\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997"
    u"\u3016\u301a\u301e\u203a\u0f3c\u2046\u29db\u27e7\u2768\u27eb"
    u"\u276c\u2770\u2774\u3019\u2984\u3009\u2988\u2996\u300d\u298c"
    u"\u3011\u2990\u3015\u2994\u2019\u2998\u201d\u301d\u23b4\u2039"
    u"\u0f3b\ufd3f\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc"
    u"]"
)
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)

# Any single character followed by zero or more repetitions of that same
# character. Compiled without DOTALL, so "." does not match a newline.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)

# (name, compiled pattern) pairs for every pattern defined in this module.
# NOTE(review): the order presumably sets matching priority in the consumer
# of this list (specific patterns first, ANY_SEQUENCE as the catch-all) --
# confirm against the code that iterates it.
re_list = [
    ('SGML_TAG', SGML_TAG_RE),
    ('WHITESPACE', WHITESPACE_RE),
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('DOTCOM', DOTCOM_RE),
    ('NUMBER', NUMBER_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('USA', USA_RE),
    ('WORD', WORD_RE),
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]