# coding=utf-8

# Make plain and raw string literals unicode on Python 2 (no-op on Python 3).
from __future__ import unicode_literals

import re
# One SGML/XML markup item: either a comment or a tag/directive whose
# attribute values are quoted.  DOTALL lets a comment span several lines.
# Written as a plain raw string: the Python 2-only ur"" prefix is a
# SyntaxError on Python 3.
# NOTE(review): a tag with an unquoted attribute value or a bare attribute
# (e.g. <a href=x>) is not matched by this pattern -- confirm that is intended.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
    """
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)
# Digits followed by a period, standing after whitespace (or at the start of
# the text) and before whitespace plus a lowercase letter.
# Plain raw string instead of the Python 2-only ur"" prefix.
# NOTE(review): \d* also accepts a period with no digits in front of it;
# confirm whether \d+ was intended.
ORDINAL = r"""
    (?<!\S)                         # preceded by space
    \d*\.
    (?=\s[a-záčďéěíňóřšťúůýž])      # followed by space and lowercase letter
    """
ORDINAL_RE = re.compile(ORDINAL, re.UNICODE | re.VERBOSE)
# Single-letter abbreviations that are recognised only in a specific context
# (checked with fixed-width lookbehind / lookahead); each alternative matches
# just the one abbreviated letter plus its period.
# Plain raw string instead of the Python 2-only ur"" prefix.
COMPLEX_ABBR = r"""
    (?<!\w) [sr]\. (?=\s\d)          # "s." (strana, page) / "r." (rok, year) before a number
    |
    (?<=\s) n\. (?=\sl\.)            # "n." of "n. l." (our)
    |
    (?<=\sn\.\s) l\.                 # "l." of "n. l." (era)
    |
    (?<=\ss\s) r\. (?=\so\.)         # "r." of "s r. o." (liability)
    |
    (?<=\ss\sr\.\s) o\.              # "o." of "s r. o." (limited)
    |
    (?<=\s) a\. (?=\ss\.)            # "a." of "a. s." (joint-stock)
    |
    (?<=\sa\.\s) s\.                 # "s." of "a. s." (company)
    """
COMPLEX_ABBR_RE = re.compile(COMPLEX_ABBR, re.UNICODE | re.VERBOSE)
# A run of whitespace characters.
# Plain raw string instead of the Python 2-only ur"" prefix.  re.UNICODE is
# passed explicitly, as the sibling patterns do, so that \s also covers
# non-ASCII whitespace on Python 2 (on Python 3 it is already the default
# for text patterns).
WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(WHITESPACE, re.UNICODE)
# Dotted host name ending in an alphabetic label of two or more letters.
# Shared by URL and EMAIL below; both are compiled with IGNORECASE.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL with an explicit scheme (optionally with user[:password]@) or a bare
# "www." prefix, followed by a host, an optional port and an optional path.
# Fixes relative to the previous version:
#   * the user part now takes one or more characters; it used to accept
#     exactly one, so "http://user:pass@host.cz" did not match at all;
#   * both fragments around DNS_HOST are raw strings, so "\.", "\w" and "\$"
#     are no longer invalid escape sequences in an ordinary string literal;
#   * no Python 2-only ur"" prefix.
URL = r"""
    (
        # scheme://[user:password]
        (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
        # or "www" without the scheme part
        |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
        | localhost |
        ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
    """
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# E-mail address: local part of letters, digits and - . _ ' followed by
# "@" and a DNS host name.
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)
# HTML/XML character reference: numeric ("&#233;", "&#xe9;") or named
# ("&amp;").  Hex digits and the "x" marker are accepted in either case;
# the previous pattern only allowed upper-case A-F, so the common lower-case
# form "&#xe9;" was not recognised.
# Plain raw string instead of the Python 2-only ur"" prefix.
HTMLENTITY = r"&(#[xX]?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)
# "#" not preceded by a word character, then an ASCII letter and at least
# one more word character.
# Plain raw string instead of the Python 2-only ur"" prefix.  re.UNICODE is
# passed explicitly, as the sibling patterns do, so that \w treats accented
# letters as word characters on Python 2 as well as on Python 3.
HASHTAG = r"(?<!\w)#[A-Za-z]\w+"
HASHTAG_RE = re.compile(HASHTAG, re.UNICODE)
# Bare domain name: one or two dot-terminated labels followed by one of a
# fixed set of top-level domains, delimited by non-word characters.
# Plain raw string instead of the Python 2-only ur"" prefix.  re.UNICODE is
# passed explicitly so the \w lookarounds treat accented letters as word
# characters on Python 2 as well as on Python 3.
DOTCOM = r"""
    (?<!\w)                                 # not inside a word
    ([-a-z0-9]+\.){1,2}(com|org|cz|sk|eu)   # label(s) + top-level domain
    (?!\w)                                  # domain must end here
    """
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE | re.UNICODE)
# Number with optional sign, inner "," / "." separators and an optional
# exponent, standing on its own (not glued to a following word or hyphen).
# Plain raw string instead of the Python 2-only ur"" prefix.
# NOTE(review): the exponent requires an explicit sign, so "1e5" is not
# matched as a number -- confirm whether [-+]? was intended.
NUMBER = r"""
    (?<!\S)                 # at the start of the text or after whitespace
    [-+]?                   # optional sign
    (\d[\d,.]*\d | \d)      # digits, optionally with inner separators
    ([eE][-+][0-9]+)?       # optional exponent with mandatory sign
    (?![-\w])               # not followed by a hyphen or a word character
    """
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)
# Fixed list of abbreviations (mostly in lower-case and capitalised
# variants) followed by a period and not preceded by a word character.
# Group 1 is the abbreviation without the period.
# The alternation is wrapped over several lines; VERBOSE mode ignores that
# whitespace, and the alternatives and their order are unchanged.
# Plain raw string instead of the Python 2-only ur"" prefix.
ABBREVIATION = r"""
    (?<!\w)
    (
        aj|ap|apod|atd|CSc|čl|Čl|čs|Čs|čsl|Čsl|doc|Doc|dr|Dr|DrSc
        |gen|Gen|Ch|ing|Ing|JUDr|kl|Kl|kupř|Kupř|max|Max|Mgr|min|Min
        |mj|Mj|mjr|Mjr|MUDr|MVDr|např|Např|nar|Nar|npor|Npor|odd|Odd
        |PaedDr|Ph|PhDr|plk|Plk|popř|Popř|pozn|Pozn|pplk|Pplk|ppor|Ppor
        |prof|Prof|př|Př|příp|Příp|resp|Resp|RNDr|RSDr|sb|Sb|soc|Soc
        |spol|Spol|srov|Srov|st|stol|St|str|Str|sv|Sv|tab|Tab|tč|Tč
        |tel|Tel|tj|Tj|tř|Tř|tzn|Tzn|tzv|Tzv|ul|Ul|zkr|Zkr|zn|Zn|zvl|Zvl
    )\.
    """
ABBREVIATION_RE = re.compile(ABBREVIATION, re.UNICODE | re.VERBOSE)
# Dotted initials such as "U.S.A.": one or more "capital letter + period"
# pairs, optionally ending in one more capital letter that is not followed
# by a word character.  The letter class is A-Z plus accented capitals.
# Plain raw string instead of the Python 2-only ur"" prefix.
USA = r"""
    (?<!\w)
    ([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]\.)+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ](?!\w))?
    """
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)
# The "-li" suffix attached directly to a word and not followed by another
# word character; only the "-li" part itself is matched.
# Plain raw string instead of the Python 2-only ur"" prefix.
CLITIC_RE = re.compile(r"(?<=\w)-li(?!\w)", re.UNICODE)
# A word: word characters with optional inner hyphens (it must start and
# end with a word character), or a single word character.
# Plain raw string instead of the Python 2-only ur"" prefix.
WORD = r"\w[\w-]*\w|\w"
WORD_RE = re.compile(WORD, re.UNICODE)
# Punctuation that may span several characters: a run of "?" / "!" marks,
# or two consecutive apostrophes.
# Plain raw string instead of the Python 2-only ur"" prefix.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)
# One character out of a fixed set of brackets and quotation marks; each
# character is matched on its own, never as a run.
# The ASCII square brackets are written as \[ and \] instead of \u005b and
# \u005d: a Python 2 raw unicode literal decodes \uXXXX before the pattern
# reaches `re`, and the decoded "]" closed the character class after the
# first few characters, so the pattern matched almost nothing.  The escaped
# form is unambiguous on both Python 2 and Python 3.  The set of characters
# is otherwise unchanged.
# Plain raw strings instead of the Python 2-only ur"" prefix.
SINGLECHAR_PUNCTUATION = (
    r"["
    r"\u0028\[\u007b\u0029\]\u007d\u2985\u2989"
    r"\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018"
    r"\u169b\u201c\xab\u23b5\xbb\u0f3a\ufd3e\u29d9"
    r"\u27e9\u276a\u276e\u2772\u29fd\u2986\u300b\u298a"
    r"\u300f\u298e\u2992\u3017\u3018\u301b\u169c\u301f"
    r"\u0f3d\u29da\u27e6\u2769\u27ea\u276d\u2771\u2775"
    r"\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997"
    r"\u3016\u301a\u301e\u203a\u0f3c\u2046\u29db\u27e7"
    r"\u2768\u27eb\u276c\u2770\u2774\u3019\u2984\u3009"
    r"\u2988\u2996\u300d\u298c\u3011\u2990\u3015\u2994"
    r"\u2019\u2998\u201d\u301d\u23b4\u2039\u0f3b\ufd3f"
    r"\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc"
    r"]"
)
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)
# Any single character followed by zero or more repetitions of that same
# character.  DOTALL is not set, so "." does not match a newline.
# Plain raw string instead of the Python 2-only ur"" prefix.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)
# (name, compiled pattern) pairs for every token class defined in this module.
# Specific patterns come before the generic ones (WORD, the punctuation
# classes and the ANY_SEQUENCE fallback are last).
# NOTE(review): presumably the consumer tries the patterns in list order --
# confirm against the caller before reordering.
re_list = [
    # markup and context-dependent abbreviations
    ('SGML_TAG', SGML_TAG_RE),
    ('ORDINAL', ORDINAL_RE),
    ('COMPLEX_ABBR', COMPLEX_ABBR_RE),
    ('WHITESPACE', WHITESPACE_RE),

    # web-like tokens
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('HASHTAG', HASHTAG_RE),
    ('DOTCOM', DOTCOM_RE),

    # numbers, abbreviations and words
    ('NUMBER', NUMBER_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('USA', USA_RE),
    ('CLITIC', CLITIC_RE),
    ('WORD', WORD_RE),

    # punctuation and the generic fallback
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]