HabitSystemV3: unitok_Czech.py

File unitok_Czech.py, 4.5 KB (added by xsuchom2, 7 years ago)
Line 
1# coding=utf-8
2
# Pattern literals are written as r"..." rather than the Python-2-only
# ur"..." prefix (a SyntaxError on Python 3); unicode_literals keeps such
# literals unicode strings when the module runs on Python 2.
from __future__ import unicode_literals

import re
4
# One SGML/XML markup unit: either a <!-- ... --> comment, or a tag or
# directive (optional leading "!", "?" or "/", a name that does not start
# with a digit, any number of quoted attribute values, optional trailing "/").
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
"""
# DOTALL lets ".*?" in the comment branch run across line breaks.
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)
19
# Ordinal number: digits plus a full stop, standing at the start of a
# whitespace-delimited chunk and followed by whitespace and a lowercase
# (Czech) letter. At least one digit is required (\d+); with the former \d*
# a bare "." in that context was also reported as an ordinal.
ORDINAL = r"""
(?<!\S) # preceded by space
    \d+\.
(?=\s[a-záčďéěíňóřšťúůýž]) # followed by space and lowercase letter
"""
ORDINAL_RE = re.compile(ORDINAL, re.UNICODE | re.VERBOSE)
26
# Context-dependent one-letter abbreviations, each matched as a single
# "<letter>." piece selected by look-around on its neighbours:
#   - "s." / "r." directly before a number (page / year),
#   - "n." and "l." of "n. l." (our era),
#   - "r." and "o." of "s r. o." (limited liability),
#   - "a." and "s." of "a. s." (joint-stock company).
# The comments inside the pattern are part of the literal and are kept as is.
COMPLEX_ABBR = r"""
(?<!\w) [sr]\. (?=\s\d) # strana 1, rok 2000
|
(?<=\s) n\. (?=\sl\.) # náš
|
(?<=\sn\.\s) l\. # letopočet
|
(?<=\ss\s) r\. (?=\so\.) # ručení
|
(?<=\ss\sr\.\s) o\. # omezený
|
(?<=\s) a\. (?=\ss\.) # akciový
|
(?<=\sa\.\s) s\. # společnost
"""
COMPLEX_ABBR_RE = re.compile(COMPLEX_ABBR, re.UNICODE | re.VERBOSE)
43
# A run of whitespace. Compiled with re.UNICODE like the other patterns in
# this module, so that \s here accepts the same characters (e.g. U+00A0) as
# the \s look-arounds used elsewhere, also on Python 2.
WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(WHITESPACE, re.UNICODE)
46
# Host name: one or more dot-terminated labels of letters, digits and "-",
# followed by a final label of at least two letters. Embedded in both URL
# and EMAIL below.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL: a scheme (optionally with user[:password]@) or a bare "www." prefix,
# then a DNS host, "localhost" or a dotted-quad address, then an optional
# port and an optional path/query part.
# The user name may be longer than one character ("+" after the first
# character class); without it "http://user:pass@host.cz" did not match.
# All segments are raw literals so the backslash escapes reach `re` unchanged.
URL = r"""
    (
    # scheme://[user:password]
    (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
    # or "www" without the scheme part
    |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
    | localhost |
    ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# E-mail address: local part of letters, digits and "-._'", "@", DNS host.
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)
69
# HTML/XML character or entity reference: "&name;", "&#digits;" or
# "&#xHEX;". Hex digits and the "x" marker are accepted in either case
# (e.g. "&#xe9;"), which the former uppercase-only class rejected.
HTMLENTITY = r"&(#[xX]?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)
72
# Hashtag: "#", an ASCII letter, then one or more word characters; not
# directly preceded by a word character. re.UNICODE makes \w cover accented
# letters on Python 2 as well, so "#kavárna" is not cut off at "á".
HASHTAG = r"(?<!\w)#[A-Za-z]\w+"
HASHTAG_RE = re.compile(HASHTAG, re.UNICODE)
75
# Bare domain name: one or two dot-terminated labels followed by one of the
# listed top-level domains, delimited on both sides by non-word characters.
# re.UNICODE makes the \w boundary checks treat accented letters as word
# characters on Python 2 too, consistently with the other patterns.
DOTCOM = r"""
(?<!\w)
    ([-a-z0-9]+\.){1,2}(com|org|cz|sk|eu)
(?!\w)
"""
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE | re.UNICODE)
82
# Number: optional sign, digits that may contain "," and "." between the
# first and last digit, and an optional exponent with a mandatory sign.
# Must start a whitespace-delimited chunk and must not be followed by "-"
# or a word character.
NUMBER = r"""
(?<!\S)
    [-+]?
    (\d[\d,.]*\d | \d)
    ([eE][-+][0-9]+)?
(?![-\w])
"""
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)
91
# Fixed list of abbreviations (titles, "např.", "atd.", "str.", ...), each
# followed by a full stop and not directly preceded by a word character.
ABBREVIATION = r"""
(?<!\w) (aj|ap|apod|atd|CSc|čl|Čl|čs|Čs|čsl|Čsl|doc|Doc|dr|Dr|DrSc|gen|Gen|Ch|ing|Ing|JUDr|kl|Kl|kupř|Kupř|max|Max|Mgr|min|Min|mj|Mj|mjr|Mjr|MUDr|MVDr|např|Např|nar|Nar|npor|Npor|odd|Odd|PaedDr|Ph|PhDr|plk|Plk|popř|Popř|pozn|Pozn|pplk|Pplk|ppor|Ppor|prof|Prof|př|Př|příp|Příp|resp|Resp|RNDr|RSDr|sb|Sb|soc|Soc|spol|Spol|srov|Srov|st|stol|St|str|Str|sv|Sv|tab|Tab|tč|Tč|tel|Tel|tj|Tj|tř|Tř|tzn|Tzn|tzv|Tzv|ul|Ul|zkr|Zkr|zn|Zn|zvl|Zvl)\.
"""
ABBREVIATION_RE = re.compile(ABBREVIATION, re.UNICODE | re.VERBOSE)
96
# Dotted initials such as "U.S.A." : one or more "<uppercase letter>." pairs,
# optionally followed by a final uppercase letter that is not itself
# followed by a word character; not directly preceded by a word character.
USA = r"""
(?<!\w)
    ([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]\.)+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ](?!\w))?
"""
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)
102
# The "-li" clitic attached to the end of a word: "-li" directly after a
# word character and not followed by another word character.
CLITIC_RE = re.compile(r"(?<=\w)-li(?!\w)", re.UNICODE)
104
# Word: word characters with optional inner hyphens; it always starts and
# ends with a word character, so a leading or trailing "-" is left out.
WORD = r"\w[\w-]*\w|\w"
WORD_RE = re.compile(WORD, re.UNICODE)
107
# Punctuation kept together as one match: a run of "?" / "!" characters,
# or two consecutive apostrophes.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)
110
# Characters that must each be matched on their own: ASCII brackets plus
# a list of Unicode bracket and quotation-mark characters.
# The list is an ordinary (non-raw) unicode literal escaped with re.escape().
# In the former ur"[\u0028\u005b...\u005d...]" form, Python 2 expanded
# \u005b / \u005d to literal "[" / "]" before `re` saw the pattern, so the
# class ended after "([{)" and the rest became a literal sequence.
_SINGLECHAR_PUNCTUATION_CHARS = (
    u"\u0028\u005b\u007b\u0029\u005d\u007d"
    u"\u2985\u2989\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018\u169b\u201c\xab\u23b5\xbb"
    u"\u0f3a\ufd3e\u29d9\u27e9\u276a\u276e\u2772\u29fd\u2986\u300b\u298a\u300f\u298e\u2992\u3017\u3018\u301b\u169c\u301f"
    u"\u0f3d\u29da\u27e6\u2769\u27ea\u276d\u2771\u2775\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997\u3016\u301a\u301e\u203a"
    u"\u0f3c\u2046\u29db\u27e7\u2768\u27eb\u276c\u2770\u2774\u3019\u2984\u3009\u2988\u2996\u300d\u298c\u3011\u2990\u3015\u2994\u2019\u2998\u201d\u301d\u23b4\u2039"
    u"\u0f3b\ufd3f\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc"
)
SINGLECHAR_PUNCTUATION = u"[" + re.escape(_SINGLECHAR_PUNCTUATION_CHARS) + u"]"
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)
113
# Fallback: any single character (except a line break, since DOTALL is not
# set) together with its immediate repetitions, e.g. "..." or "---".
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)
116
# Ordered table of (token type name, compiled pattern) pairs.
# NOTE(review): the code consuming this list is not part of this file;
# presumably the patterns are tried in list order, so the order below is
# significant -- confirm against the caller before reordering.
re_list = [
    # markup and context-sensitive abbreviations
    ('SGML_TAG', SGML_TAG_RE),
    ('ORDINAL', ORDINAL_RE),
    ('COMPLEX_ABBR', COMPLEX_ABBR_RE),
    ('WHITESPACE', WHITESPACE_RE),

    # web-like items
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('HASHTAG', HASHTAG_RE),
    ('DOTCOM', DOTCOM_RE),

    # numbers, abbreviations and words
    ('NUMBER', NUMBER_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('USA', USA_RE),
    ('CLITIC', CLITIC_RE),
    ('WORD', WORD_RE),

    # punctuation and the catch-all
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]