1、提取顶级域名信息
pip install tld
| from tld import get_tld |
| |
| get_tld("http://www.google.co.uk") |
| |
| |
| |
| get_tld("http://www.google.idontexist", fail_silently=True) |
| |
| from tld import get_tld |
| |
| res = get_tld("http://some.subdomain.google.co.uk", as_object=True) |
| |
| res |
| |
| |
| res.subdomain |
| |
| |
| res.domain |
| |
| |
| res.tld |
| |
| |
| res.fld |
| |
| |
| res.parsed_url |
| |
| |
| |
| |
| |
| |
| |
| from tld import get_tld, get_fld |
| |
| |
| get_tld("www.google.co.uk", fix_protocol=True) |
| |
| |
| |
| get_fld("www.google.co.uk", fix_protocol=True) |
| |
| from tld import parse_tld |
| |
| parse_tld('http://www.google.com') |
| |
| from tld import get_fld |
| |
| get_fld("http://www.google.co.uk") |
| |
| |
| get_fld("http://www.google.idontexist", fail_silently=True) |
| |
| from tld import is_tld |
| |
| is_tld('co.uk') |
| # True |
| |
| is_tld('uk') |
| # True |
| |
| is_tld('tld.doesnotexist') |
| # False |
| |
| is_tld('www.google.com') |
| # False |
| |
| |
| |
| |
| |
| from tld.utils import update_tld_names |
| |
| update_tld_names() |
2、对URL按照一定格式进行拆分
| from urllib.parse import urlparse |
| |
| info = urlparse( |
| "https://club.jd.com/comment/productPageComments.action?&productId=100000177748&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&fold=1" |
| ) |
| print(info) |
将url分成六个部分,返回一个包含6个字符串项目的具名元组:协议(scheme),网络位置(netloc),路径(path),参数(params),查询(query),片段(fragment)。
| ParseResult( |
| scheme="https", |
| netloc="club.jd.com", |
| path="/comment/productPageComments.action", |
| params="", |
| query="&productId=100000177748&score=0&sortType=5&page=1&pageSize=10&isShadowSku=0&fold=1", |
| fragment="", |
| ) |
scheme是协议,netloc是网络位置(即域名,可带端口),path是路径,params是最后一段路径的参数,query是查询字符串,fragment是片段标识符(即URL中"#"之后的锚点部分)。
文章评论