AI摘要
正在生成中……
常用元字符:
|
|
. |
匹配除换行符以外的任意字符 |
\w |
匹配字母或数字或下划线 |
\s |
匹配任意的空白符 |
\n |
匹配一个换行符 |
\t |
匹配一个制表符 |
\W |
匹配非字母或数字或下划线 |
\D |
匹配非数字 |
\S |
匹配非空白符 |
^ |
匹配字符串的开始 |
$ |
匹配字符串的结尾 |
a|b |
字符a或者字符b |
() |
匹配括号内的表达式, 也表示一个组 |
[…] |
匹配字符组中的字符 |
[^…] |
匹配除了字符组中字符以外的所有字符 |
|
|
[a-zA-Z0-9_] 中括号里面“-”表示范围, 匹配数字、字母、下划线
量词: 控制前面的元字符出现的次数
|
|
* |
重复零次或更多次 |
+ |
重复一次或者更多次 |
? |
重复零次或一次 |
{n} |
重复n次 |
{n,} |
重复n次或更多次 |
{n,m} |
重复n到m次 |
贪婪匹配和惰性匹配
python的re模块使用
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
| import re
# Tour of the main `re` entry points, each extracting runs of digits.

# findall: return every non-overlapping match as a list of strings.
# Bound to `numbers` (not `list`) so the builtin `list` is not shadowed.
numbers = re.findall(r"\d+", "我的电话是:10086, 我女朋友电话是:10010")
print(numbers)

# finditer: yield match objects one at a time; .group() is the matched text.
it = re.finditer(r"\d+", "我的电话是:10086, 我女朋友电话是:10010")
for match in it:
    print(match.group())

# search: return only the first match found anywhere in the string.
s = re.search(r"\d+", "我的电话是:10086, 我女朋友电话是:10010")
print(s.group())

# compile: build the pattern once, then reuse it for several calls.
obj = re.compile(r"\d+")

ret = obj.finditer("我的电话是:10086, 我女朋友电话是:10010")
for match in ret:
    print(match.group())

haha = obj.findall("发发发发, 你滴2093429035")
print(haha)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
| import re
# Sample markup: five <div> rows, each wrapping a <span> with an id and a name.
# Adjacent literals concatenate to the same text as the original one-line string.
s = (
    " <div class='jay'><span id='1'>苏大强</span></div>"
    " <div class='jj'><span id='2'>范德蒙</span></div>"
    " <div class='jolin'><span id='3'>埃蒂斯</span></div>"
    " <div class='sylar'><span id='4'>亚索论</span></div>"
    " <div class='tory'><span id='5'>鲁宾孙</span></div> "
)

# (?P<name>...) defines a named group, so each captured piece can be read
# back by name. re.S lets "." also match newlines.
obj = re.compile(
    r"<div class='.*?'><span id='(?P<id>\d+)'>(?P<waqu>.*?)</span></div>",
    re.S,
)

result = obj.finditer(s)
for match in result:
    print(match.group("waqu"))
    print(match.group("id"))
|
例1
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
|
# Example 1: fetch the Douban Top 250 page and write the fields captured by
# the regex (name, year, score, num) to data.csv.
import csv
import re

import requests

url = "https://movie.douban.com/top250"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
resp = requests.get(url, headers=header)
page_content = resp.text

# Four named groups, one per output column. re.S lets ".*?" cross line breaks
# in the HTML.
# NOTE(review): the third fragment starts with a plain space; the original
# pattern may have used "&nbsp" there before the page was rendered. Confirm
# against the live HTML.
obj = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
    r'</span>.*?<p class="">.*?<br>(?P<year>.*?)'
    r' .*?<span class="rating_num" property="v:average">(?P<score>.*?)'
    r'</span>.*?<span>(?P<num>.*?)人评价</span>',
    re.S,
)

result = obj.finditer(page_content)

# `with` closes the file even if a row write raises. newline="" is what the
# csv module asks for, so the writer controls line endings itself.
with open("data.csv", mode="w", encoding="utf-8", newline="") as f:
    csvwriter = csv.writer(f)
    for match in result:
        dic = match.groupdict()
        # The captured year text carries surrounding whitespace from the page.
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())

print("over!")
|
例2
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
|
# Example 2: read the "2024必看热片" list on the dy2018 home page, follow each
# movie link, and print the movie title and download link from its detail page.
import re

import requests

url = "https://www.dy2018.com/"
header = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36"
}
resp = requests.get(url, headers=header)
# The response is decoded as gb2312 instead of the encoding requests guesses.
resp.encoding = 'gb2312'

# obj1: the <ul> body that follows the section heading.
# obj2: each link href inside that <ul>.
# obj3: movie title and download link on a detail page.
# NOTE(review): the whitespace inside "◎片 名" may have been altered when the
# page was copied; confirm against the live HTML.
obj1 = re.compile(r'2024必看热片.*?<ul>(?P<ul>.*?)</ul>', re.S)
obj2 = re.compile(r"<a href='(?P<href>.*?)'", re.S)
obj3 = re.compile(
    r'◎片 名(?P<movie>.*?)<br />.*?<td style="WORD-WRAP: break-word" bgcolor="#fdfddf"><a href="(?P<bt>.*?)">',
    re.S,
)

# Step 1: collect the absolute URL of every detail page in the section.
result1 = obj1.finditer(resp.text)
child_href_list = []
for section in result1:
    ul = section.group('ul')
    result2 = obj2.finditer(ul)
    for link in result2:
        # `url` already ends with "/", so strip the href's own slashes
        # before joining.
        child_href = url + link.group("href").strip("/")
        child_href_list.append(child_href)

# Step 2: visit each detail page and extract title and download link.
for href in child_href_list:
    child_resp = requests.get(href, headers=header)
    child_resp.encoding = 'gb2312'
    result3 = obj3.search(child_resp.text)
    if result3 is None:
        # search() returns None when the page does not fit the pattern;
        # skip it instead of failing with AttributeError on .group().
        print(f"no match: {href}")
        continue
    print(result3.group("movie"))
    print(result3.group("bt"))
|