9.11. Syntax Use Cases¶
9.11.1. National Identification Numbers¶
>>> def pesel_check_digit(self):
...     weights = (1, 3, 7, 9, 1, 3, 7, 9, 1, 3)
...     check = sum(w * int(n) for w, n in zip(weights, self.pesel))
...     return str((10 - check) % 10)







9.11.2. Dates¶
ISO Date:
>>> pattern = r'^\d{4}-\d{2}-\d{2}$'
US Long Date:
>>> pattern = r'^\w+ \d{2}, \d{4}$'
US Short Date:
>>> pattern = r'^\d{2}/\d{2}/\d{2}$'
9.11.3. Email¶
>>> pattern = r'^[a-zA-Z0-9][\w.+-]*@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]{2,20}$'
W3C HTML5 Standard 2 regexp for email field
>>> pattern = r"^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)*$"
9.11.4. URL¶
W3C standard for URL understanding
^(?=[^&])(?:(?<scheme>[^:/?#]+):)?(?://(?<authority>[^/?#]*))?
(?<path>[^?#]*)(?:\?(?<query>[^#]*))?(?:#(?<fragment>.*))?
>>> # Python's re module spells named groups (?P<name>...), not (?<name>...)
>>> scheme = r'(?:(?P<scheme>[^:/?#]+):)?'
>>> authority = r'(?://(?P<authority>[^/?#]*))?'
>>> path = r'(?P<path>[^?#]*)'
>>> query = r'(?:\?(?P<query>[^#]*))?'
>>> fragment = r'(?:#(?P<fragment>.*))?'
>>>
>>> pattern = f'^(?=[^&]){scheme}{authority}{path}{query}{fragment}'
>>>
>>> print(pattern)
^(?=[^&])(?:(?P<scheme>[^:/?#]+):)?(?://(?P<authority>[^/?#]*))?(?P<path>[^?#]*)(?:\?(?P<query>[^#]*))?(?:#(?P<fragment>.*))?
W3C standard for URL parsing
/^\s*[a-z](?:[-a-z0-9\+\.])*:(?:\/\/(?:(?:%[0-9a-f][0-9a-f]|[-a-z0-9\._~
\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD\u30000-
\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD\u80000-
\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD\uD0000-
\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:])*@)?(?:\[(?:(?:(?:[0-9a-f]{1,4}:)
{6}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4]
[0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}
)|::(?:[0-9a-f]{1,4}:){5}(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]
|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2
[0-4][0-9]|25[0-5])){3})|(?:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){4}
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]
|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){3}(?:[0-9a-f]{1,4}:
[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]
{1,4}:){0,2}[0-9a-f]{1,4})?::(?:[0-9a-f]{1,4}:){2}(?:[0-9a-f]{1,4}:[0-9a-f]
{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,3}
[0-9a-f]{1,4})?::[0-9a-f]{1,4}:(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|
[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]
[0-9]|2[0-4][0-9]|25[0-5])){3})|(?:(?:[0-9a-f]{1,4}:){0,4}[0-9a-f]{1,4})?::
(?:[0-9a-f]{1,4}:[0-9a-f]{1,4}|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|
25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3})|
(?:(?:[0-9a-f]{1,4}:){0,5}[0-9a-f]{1,4})?::[0-9a-f]{1,4}|(?:(?:[0-9a-f]
{1,4}:){0,6}[0-9a-f]{1,4})?::)|v[0-9a-f]+[-a-z0-9\._~!\$&\'\(\)\*\+,;
=:]+)\]|(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])(?:\.
(?:[0-9]|[1-9][0-9]|1[0-9][0-9]|2[0-4][0-9]|25[0-5])){3}|
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=@])*)(?::[0-9]*)?(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|\/(?:(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*)?|(?:(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))+)(?:\/(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@]))*)*|(?!
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])))(?:\?(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])
|[\uE000-\uF8FF\uF0000-\uFFFFD|\u100000-\u10FFFD\/\?])*)?(?:\#(?:
(?:%[0-9a-f][0-9a-f]|[-a-z0-9\
._~\uA0-\uD7FF\uF900-\uFDCF\uFDF0-\uFFEF\u10000-\u1FFFD\u20000-\u2FFFD
\u30000-\u3FFFD\u40000-\u4FFFD\u50000-\u5FFFD\u60000-\u6FFFD\u70000-\u7FFFD
\u80000-\u8FFFD\u90000-\u9FFFD\uA0000-\uAFFFD\uB0000-\uBFFFD\uC0000-\uCFFFD
\uD0000-\uDFFFD\uE1000-\uEFFFD!\$&\'\(\)\*\+,;=:@])|[\/\?])*)?\s*$/i
9.11.5. Parsing URLs¶
Source 3
To parse a URL url into its component parts, the user agent must use the following steps:
Strip leading and trailing space characters from url.
Parse url in the manner defined by RFC 3986, with the following exceptions:
- Add all characters with code points less than or equal to U+0020 or greater than or equal to U+007F to the <unreserved> production.
- Add the characters U+0022, U+003C, U+003E, U+005B ... U+005E, U+0060, and U+007B ... U+007D to the <unreserved> production.
- Add a single U+0025 PERCENT SIGN character as a second alternative way of matching the <pct-encoded> production, except when the <pct-encoded> is used in the <reg-name> production.
- Add the U+0023 NUMBER SIGN character to the characters allowed in the <fragment> production.
If url doesn't match the <URI-reference> production, even after the above changes are made to the ABNF definitions, then parsing the URL fails with an error. [RFC 3986] Otherwise, parsing url was successful; the components of the URL are substrings of url defined as follows:
- scheme¶
The substring matched by the <scheme> production, if any.
- host¶
The substring matched by the <host> production, if any.
- port¶
The substring matched by the <port> production, if any.
- hostport¶
If there is a <scheme> component and a <port> component and the port given by the <port> component is different than the default port defined for the protocol given by the <scheme> component, then <hostport> is the substring that starts with the substring matched by the <host> production and ends with the substring matched by the <port> production, and includes the colon in between the two. Otherwise, it is the same as the <host> component.
- path¶
The substring matched by one of the following productions, if one of them was matched:
- path-abempty¶
- path-absolute¶
- path-noscheme¶
- path-rootless¶
- path-empty¶
- query¶
The substring matched by the <query> production, if any.
- fragment¶
The substring matched by the <fragment> production, if any.
- host-specific¶
The substring that follows the substring matched by the <authority> production, or the whole string if the <authority> production wasn't matched.
9.11.6. References¶
- 1
RFC 3696. Year: 2019. Retrieved: 2019-03-13. URL: https://datatracker.ietf.org/doc/html/rfc3696#section-3
- 2
W3C. Parsing Email. Year: 2019. Retrieved: 2019-03-13. URL: https://html.spec.whatwg.org/multipage/input.html#valid-e-mail-address
- 3
W3C. Parsing URLs. Year: 2019. Retrieved: 2019-03-13. URL: https://dev.w3.org/html5/spec-LC/urls.html#parsing-urls
9.11.7. Assignments¶
"""
* Assignment: RE Standards IsValidPesel
* Complexity: easy
* Lines of code: 4 lines
* Time: 5 min
English:
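1. Write the implementation of `is_pesel_valid`
a. Full PESEL validation is too complex for a regex
b. In this function we use only the simple check from `PATTERN` (exactly 11 digits)
c. Even this small piece of code prevents about 80% of errors
2. Run doctests - all must succeed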
Polish:
1. Napisz implementację `is_pesel_valid`
a. Temat walidacji Pesel jest zbyt trudny dla Regex
b. W tej funkcji używamy prostego sprawdzenia r'^\d{11}$'
c. Już tylko taki kawałek kodu pozwoli na uniknięcie 80% błędów
2. Uruchom doctesty - wszystkie muszą się powieść
Tests:
>>> import sys; sys.tracebacklimit = 0
>>> is_pesel_valid('69072101234')
True
>>> is_pesel_valid('18220812345')
True
"""
import re
# Format check only: eleven digits between the `^` and `$` anchors.
PATTERN = r'^\d{11}$'


def is_pesel_valid(pesel: str) -> bool:
    """Exercise placeholder: report whether ``pesel`` looks like a PESEL number.

    The task description in the module docstring asks for a format-only
    check based on ``PATTERN``. The body is intentionally left as ``...``
    for the student to fill in, so the function currently returns ``None``.
    """
    ...
"""
* Assignment: RE Standards IsPeselWoman
* Complexity: easy
* Lines of code: 3 lines
* Time: 5 min
English:
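1. Write the implementation of `is_pesel_woman`
a. A PESEL belongs to a woman if its second-to-last digit is even
b. Do not use regex
2. Run doctests - all must succeed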
Polish:
1. Napisz implementację `is_pesel_woman`
a. Pesel należy do kobiety, jeżeli przedostatnia cyfra jest parzysta
b. Nie korzystaj z regex
2. Uruchom doctesty - wszystkie muszą się powieść
Tests:
>>> import sys; sys.tracebacklimit = 0
>>> is_pesel_woman(69072101234)
False
>>> is_pesel_woman(18220812345)
True
"""
# Not referenced by the stub below; the task text asks for a non-regex solution.
PATTERN = r'^\d{11}$'
# Even digits: second-to-last digit values that mark a woman's PESEL.
WOMAN = {0,2,4,6,8}
# Odd digits: second-to-last digit values that mark a man's PESEL.
MAN = {1,3,5,7,9}


# type: Callable[[int], bool]
def is_pesel_woman(pesel: int) -> bool:
    """
    Check whether a PESEL belongs to a woman.

    If the second-to-last digit is even, the PESEL is a woman's;
    otherwise it is a man's.

    Exercise placeholder: the body is intentionally left as ``...``
    for the student to fill in, so the function currently returns ``None``.
    """
    ...
"""
* Assignment: RE Standards PESEL
* Complexity: medium
* Lines of code: 0 lines
* Time: 5 min
* Warning: Do not write any code - **discussion only**
English:
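1. Do not write code, only hold a discussion
2. We consider only the PESEL numbers of people born before the year 2000
3. Given the PESEL "69072101234"
a. What expression can be at the first position of a PESEL?
b. What expression can be at the second position of a PESEL?
c. What expression can be at the third position of a PESEL?
d. What expression can be at the fourth position of a PESEL?
e. What expression can be at the fifth position of a PESEL?
f. What expression can be at the sixth position of a PESEL?
4. What is a checksum?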
Polish:
1. Nie pisz kodu, przeprowadź tylko dyskusję
2. Zajmujemy się tylko peselami ludzi urodzonymi przed 2000 rokiem
3. Mając PESEL "69072101234"
a. Jakie wyrażenie może być na pierwszym miejscu w PESEL?
b. Jakie wyrażenie może być na drugim miejscu w PESEL?
c. Jakie wyrażenie może być na trzecim miejscu w PESEL?
d. Jakie wyrażenie może być na czwartym miejscu w PESEL?
e. Jakie wyrażenie może być na piątym miejscu w PESEL?
f. Jakie wyrażenie może być na szóstym miejscu w PESEL?
4. Co to jest suma kontrolna?
"""