1 | # coding=utf-8 |
---|
2 | |
---|
3 | import re |
---|
4 | |
---|
# Matches one complete piece of SGML/XML/HTML markup: either a comment
# (<!-- ... -->) or a tag/directive.  Attribute values are only recognised
# when they are quoted (single or double quotes).
#
# Written as a plain raw string instead of the Python 2-only ``ur`` prefix,
# which is a SyntaxError on Python 3.  The pattern text is pure ASCII, so the
# literal is valid on both Python 2 and Python 3.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
"""
# VERBOSE: layout whitespace and '#' comments inside the pattern are ignored.
# DOTALL:  '.' also matches newlines, so a comment may span several lines.
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)
19 | |
---|
# A run of one or more whitespace characters.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
WHITESPACE = r"\s+"
# re.UNICODE makes ``\s`` cover non-ASCII whitespace on Python 2 as well
# (Python 3 already does so for text patterns), consistent with the other
# \s / \w based patterns in this module.
WHITESPACE_RE = re.compile(WHITESPACE, re.UNICODE)
22 | |
---|
# Host name: one or more dot-terminated labels followed by an alphabetic
# top-level domain of at least two letters.  Embedded in both URL and EMAIL.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL: a scheme (optionally followed by user[:password]@) or a bare "www."
# prefix, then a host, an optional port and an optional path/query part.
#
# Every segment is a raw string (the Python 2-only ``ur`` prefix is a
# SyntaxError on Python 3, and the segment after DNS_HOST used to be a
# non-raw literal containing regex backslash escapes).
# The user-name class now carries a ``+`` quantifier; without it only
# one-character user names were accepted, so URLs such as
# "http://user:pass@host.tld/" did not match at all.
URL = r"""
    (
        # scheme://[user[:password]@]
        (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
        # or "www" without the scheme part
        |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
        | localhost |
        ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# E-mail address: a local part of letters, digits and - . _ ' characters,
# an "@", then a DNS host name (so "user@localhost" is not matched).
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)
45 | |
---|
# HTML/XML character reference: "&name;", decimal "&#160;" or hexadecimal
# "&#xA0;".
# The hexadecimal branch accepts digits and the "x" marker in either case;
# previously only upper-case A-F was allowed, so the common lower-case form
# "&#xa0;" was not recognised as an entity.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
HTMLENTITY = r"&(#[xX]?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)
48 | |
---|
# Bare host name ending in ".com" or ".org": one or two dot-terminated labels
# followed by the top-level domain, with no word character directly before or
# after the match.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
DOTCOM = r"""
    (?<!\w)                         # not glued to a preceding word character
    ([-a-z0-9]+\.){1,2}(com|org)    # label. [label.] com|org
    (?!\w)                          # not glued to a following word character
"""
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE)
55 | |
---|
# Number: optional sign, digits that may contain "," and "." separators
# (first and last character must be digits), and an optional exponent.
# The match must start at the beginning of the text or right after
# whitespace, and must not be followed by "-" or a word character.
#
# The exponent sign is optional: it used to be mandatory, which meant that
# "1e10" was not merely truncated but rejected entirely (the trailing
# look-ahead fails on the "e").
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
NUMBER = r"""
    (?<!\S)                 # preceded by whitespace or start of text
    [-+]?                   # optional sign
    (\d[\d,.]*\d | \d)      # digits with inner , and . separators
    ([eE][-+]?[0-9]+)?      # optional exponent, sign optional
    (?![-\w])               # not followed by "-" or a word character
"""
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)
64 | |
---|
# Common abbreviations that keep their trailing period as part of the token.
# The match must not be directly preceded by a word character.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
ABBREVIATION = r"""
    (?<!\w)
    (?:
        #general
        co\.|etc\.|inc\.|ltd\.|dr\.|prof\.|jr\.
    )
"""
# IGNORECASE is required: the alternatives are spelled in lower case, but
# these abbreviations are normally written capitalised ("Dr.", "Prof.",
# "Inc."), and without the flag those forms never matched.
ABBREVIATION_RE = re.compile(
    ABBREVIATION, re.IGNORECASE | re.UNICODE | re.VERBOSE
)
73 | |
---|
# Dotted initialism such as "U.S.A." or "U.S.A": one or more
# "capital letter + period" pairs, optionally followed by one more capital
# letter provided that letter is not itself followed by a word character.
# The match must not be directly preceded by a word character.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
USA = r"""
    (?<!\w)
    ([A-Z]\.)+([A-Z](?!\w))?
"""
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)
79 | |
---|
# Word token.
#
# The standard word pattern would be r"\w[\w-]*\w|\w".  It used to be
# assigned (and compiled) here and then immediately overwritten by the
# definition below, so that dead assignment has been dropped; only the
# pattern below was ever in effect.
#
# Special for Ethiopian languages: an apostrophe is part of a word when there
# is a letter on both sides of the apostrophe and there is no digit on either
# side of it.  Every character of the word is additionally required not to be
# a digit, so digits never belong to a WORD token.
#
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
WORD = r"(?:(?!\d)(?:[\w-]|(?<!\d)(?<=\w)'(?=\w)(?!\d)))+"
WORD_RE = re.compile(WORD, re.UNICODE)
89 | |
---|
# Punctuation made of several characters: a run of "?" / "!" in any mix
# (a single "?" or "!" also matches), or two consecutive apostrophes.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)
92 | |
---|
# Characters that are always a token on their own: ASCII ( [ { ) ] } plus a
# collection of non-ASCII bracket and quotation-mark code points.
#
# The characters are listed in an ordinary (non-raw) unicode literal so that
# the \u / \x escapes are decoded by the Python parser itself, which works on
# both Python 2 and Python 3.  The previous ``ur"[...]"`` literal is a
# SyntaxError on Python 3; on Python 2 it decoded \u005d to a bare "]" before
# the regex engine saw it, which closed the character class after only four
# characters and left the rest of the list as a literal sequence to match.
_SINGLECHAR_PUNCTUATION_CHARS = (
    u"\u0028\u005b\u007b\u0029\u005d\u007d\u2985\u2989"
    u"\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018"
    u"\u169b\u201c\xab\u23b5\xbb\u0f3a\ufd3e\u29d9"
    u"\u27e9\u276a\u276e\u2772\u29fd\u2986\u300b\u298a"
    u"\u300f\u298e\u2992\u3017\u3018\u301b\u169c\u301f"
    u"\u0f3d\u29da\u27e6\u2769\u27ea\u276d\u2771\u2775"
    u"\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997"
    u"\u3016\u301a\u301e\u203a\u0f3c\u2046\u29db\u27e7"
    u"\u2768\u27eb\u276c\u2770\u2774\u3019\u2984\u3009"
    u"\u2988\u2996\u300d\u298c\u3011\u2990\u3015\u2994"
    u"\u2019\u2998\u201d\u301d\u23b4\u2039\u0f3b\ufd3f"
    u"\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc"
)
# re.escape backslash-protects "[" and "]" (among others) so that every listed
# character is a literal member of the character class.
SINGLECHAR_PUNCTUATION = u"[" + re.escape(_SINGLECHAR_PUNCTUATION_CHARS) + u"]"
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)
95 | |
---|
# Catch-all: any single character (except a newline, since DOTALL is not set)
# together with all immediately following repetitions of that same character.
# Plain raw string: the Python 2-only ``ur`` prefix is a SyntaxError on
# Python 3.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)
98 | |
---|
# Table of token types: each entry pairs a token-type name with the compiled
# regular expression that recognises it.
# NOTE(review): the entries run from the most specific patterns down to the
# ANY_SEQUENCE catch-all, so presumably the consumer tries them in list order
# and takes the first match -- confirm against the code that iterates
# re_list, which is not visible here.
re_list = [
    ('SGML_TAG', SGML_TAG_RE),
    ('WHITESPACE', WHITESPACE_RE),
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('DOTCOM', DOTCOM_RE),
    ('NUMBER', NUMBER_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('USA', USA_RE),
    ('WORD', WORD_RE),
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]