Context Navigation

Back to HabitSystemV3

HabitSystemV3: unitok_Czech.py

File unitok_Czech.py, 4.5 KB (added by xsuchom2, 9 years ago)

Line
# coding=utf-8
"""Regular expressions describing Czech token classes.

Every token class is defined as a pattern string (``NAME``) together with
its compiled form (``NAME_RE``).  ``re_list`` collects the compiled patterns
as ``(name, regex)`` pairs.

NOTE(review): the code that consumes ``re_list`` is not part of this module;
presumably a tokenizer tries the patterns in list order -- confirm against
the caller before reordering anything.
"""

# Makes every literal below a unicode string on Python 2 as well, replacing
# the Python-2-only ``ur"..."`` prefix that is a syntax error on Python 3.
from __future__ import unicode_literals

import re

# XML/SGML comments, tags and directives.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value (may be empty)
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
"""
SGML_TAG_RE = re.compile(SGML_TAG, re.UNICODE | re.VERBOSE | re.DOTALL)

# Ordinal numbers written with a trailing dot ("5. kvetna").
# NOTE(review): ``\d*`` also accepts a bare "." with no digits; confirm
# whether ``\d+`` was intended before tightening it.
ORDINAL = r"""
    (?<!\S)                             # preceded by space
    \d*\.
    (?=\s[a-záčďéěíňóřšťúůýž])          # followed by space and lowercase letter
"""
ORDINAL_RE = re.compile(ORDINAL, re.UNICODE | re.VERBOSE)

# Abbreviations that are only recognisable from their neighbours.
COMPLEX_ABBR = r"""
    (?<!\w) [sr]\. (?=\s\d)             # strana 1, rok 2000
    |
    (?<=\s) n\. (?=\sl\.)               # náš
    |
    (?<=\sn\.\s) l\.                    # letopočet
    |
    (?<=\ss\s) r\. (?=\so\.)            # ručení
    |
    (?<=\ss\sr\.\s) o\.                 # omezený
    |
    (?<=\s) a\. (?=\ss\.)               # akciový
    |
    (?<=\sa\.\s) s\.                    # společnost
"""
COMPLEX_ABBR_RE = re.compile(COMPLEX_ABBR, re.UNICODE | re.VERBOSE)

WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(WHITESPACE)

# Shared by URL and EMAIL.  It contains neither whitespace nor "#", so it can
# be spliced into a re.VERBOSE pattern unchanged.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# Both halves are raw strings so that "\." and "\w" reach the regex engine
# untouched instead of relying on Python's handling of unknown escapes.
URL = r"""
    (
    # scheme://[user:password]
    (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
    # or "www" without the scheme part
    |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
    | localhost |
    ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.IGNORECASE)

HTMLENTITY = r"&(#x?[0-9A-F]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)

HASHTAG = r"(?<!\w)#[A-Za-z]\w+"
HASHTAG_RE = re.compile(HASHTAG)

# Bare host names under a few common top-level domains ("seznam.cz").
DOTCOM = r"""
    (?<!\w)
    ([-a-z0-9]+\.){1,2}(com|org|cz|sk|eu)
    (?!\w)
"""
DOTCOM_RE = re.compile(DOTCOM, re.IGNORECASE | re.VERBOSE)

NUMBER = r"""
    (?<!\S)
    [-+]?
    (\d[\d,.]*\d | \d)
    ([eE][-+][0-9]+)?
    (?![-\w])
"""
NUMBER_RE = re.compile(NUMBER, re.UNICODE | re.VERBOSE)

# Fixed list of abbreviations written with a trailing dot.  The order of the
# alternatives is kept exactly as it was.
ABBREVIATION = r"""
    (?<!\w) (
        aj|ap|apod|atd|CSc|čl|Čl|čs|Čs|čsl|Čsl|doc|Doc|dr|Dr|DrSc|gen|Gen
        |Ch|ing|Ing|JUDr|kl|Kl|kupř|Kupř|max|Max|Mgr|min|Min|mj|Mj|mjr|Mjr
        |MUDr|MVDr|např|Např|nar|Nar|npor|Npor|odd|Odd|PaedDr|Ph|PhDr
        |plk|Plk|popř|Popř|pozn|Pozn|pplk|Pplk|ppor|Ppor|prof|Prof|př|Př
        |příp|Příp|resp|Resp|RNDr|RSDr|sb|Sb|soc|Soc|spol|Spol|srov|Srov
        |st|stol|St|str|Str|sv|Sv|tab|Tab|tč|Tč|tel|Tel|tj|Tj|tř|Tř
        |tzn|Tzn|tzv|Tzv|ul|Ul|zkr|Zkr|zn|Zn|zvl|Zvl
    )\.
"""
ABBREVIATION_RE = re.compile(ABBREVIATION, re.UNICODE | re.VERBOSE)

# Dotted sequences of capital letters ("U.S.A.", "U.S.A").
USA = r"""
    (?<!\w)
    ([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ]\.)+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ](?!\w))?
"""
USA_RE = re.compile(USA, re.UNICODE | re.VERBOSE)

CLITIC_RE = re.compile(r"(?<=\w)-li(?!\w)", re.UNICODE)

WORD = r"\w[\w-]*\w|\w"
WORD_RE = re.compile(WORD, re.UNICODE)

MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(MULTICHAR_PUNCTUATION)

# Brackets and quotation marks that always form a token of their own.
# They are listed as real characters and passed through re.escape():
# Python 2 decodes \uXXXX even inside a raw unicode literal, so writing
# "\u005d" directly inside "[...]" would end the character class early.
_SINGLECHAR_PUNCTUATION_CHARS = (
    "\u0028\u005b\u007b\u0029\u005d\u007d\u2985\u2989"
    "\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018"
    "\u169b\u201c\xab\u23b5\xbb\u0f3a\ufd3e\u29d9"
    "\u27e9\u276a\u276e\u2772\u29fd\u2986\u300b\u298a"
    "\u300f\u298e\u2992\u3017\u3018\u301b\u169c\u301f"
    "\u0f3d\u29da\u27e6\u2769\u27ea\u276d\u2771\u2775"
    "\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997"
    "\u3016\u301a\u301e\u203a\u0f3c\u2046\u29db\u27e7"
    "\u2768\u27eb\u276c\u2770\u2774\u3019\u2984\u3009"
    "\u2988\u2996\u300d\u298c\u3011\u2990\u3015\u2994"
    "\u2019\u2998\u201d\u301d\u23b4\u2039\u0f3b\ufd3f"
    "\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc"
)
SINGLECHAR_PUNCTUATION = "[" + re.escape(_SINGLECHAR_PUNCTUATION_CHARS) + "]"
SINGLECHAR_PUNCTUATION_RE = re.compile(SINGLECHAR_PUNCTUATION, re.UNICODE)

# Fallback: a run of one repeated character.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(ANY_SEQUENCE)

# (name, compiled pattern) pairs, in the original order.
re_list = [
    ('SGML_TAG', SGML_TAG_RE),
    ('ORDINAL', ORDINAL_RE),
    ('COMPLEX_ABBR', COMPLEX_ABBR_RE),
    ('WHITESPACE', WHITESPACE_RE),
    ('URL', URL_RE),
    ('EMAIL', EMAIL_RE),
    ('HTMLENTITY', HTMLENTITY_RE),
    ('HASHTAG', HASHTAG_RE),
    ('DOTCOM', DOTCOM_RE),
    ('NUMBER', NUMBER_RE),
    ('ABBREVIATION', ABBREVIATION_RE),
    ('USA', USA_RE),
    ('CLITIC', CLITIC_RE),
    ('WORD', WORD_RE),
    ('MULTICHAR_PUNCTUATION', MULTICHAR_PUNCTUATION_RE),
    ('SINGLECHAR_PUNCTUATION', SINGLECHAR_PUNCTUATION_RE),
    ('ANY_SEQUENCE', ANY_SEQUENCE_RE),
]

Download in other formats:

Original Format