python3抓取每日心语新闻
之前有个群友在群里炫耀他用 python 爬取一个网站，天天给他自己的网站发布最新的新闻。我问到了爬取的源站，自己也花了几天时间写了一个，并且把每日新闻发到钉钉上了。
系统:centos7.x(64位)
cat /root/soft_shell/weixin_sogou.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@IDE :PyCharm
@Author :rocdk890
@Date :2023/05/31 14:32
@Update : 2024/02/18 17:17
'''
import ssl
import socket
import os
import base64
import json
import datetime
import hashlib
import hmac
import time
import urllib.parse
import requests
import random
import re
import textwrap
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Browser-like User-Agent shared by every HTTP request in this script,
# so the scraper is not trivially rejected as a non-browser client.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
def dingding_notify(message, access_token):
    """Post a plain-text message to a DingTalk group robot (signed webhook).

    Args:
        message: text content to send.
        access_token: the robot's access token.

    Raises:
        Exception: if the webhook responds with a non-200 status code.
    """
    timestamp = str(round(time.time() * 1000))
    # DingTalk signing secret; the signature is HMAC-SHA256 over
    # "<timestamp>\n<secret>", base64-encoded and then URL-quoted.
    secret = 'SEC9xxxxxxxxxxxxxxxxxxxxxxxxxxx'
    secret_enc = secret.encode('utf-8')
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    string_to_sign_enc = string_to_sign.encode('utf-8')
    hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote(base64.b64encode(hmac_code))
    headers = {
        "Content-Type": "application/json",
        "Charset": "UTF-8"
    }
    data = {
        "msgtype": "text",
        "text": {
            "content": message
        }
    }
    # BUG FIX: the original concatenated "×tamp=" -- a mojibake of
    # "&timestamp=" ("&times" was decoded as the HTML entity for '×') --
    # which produced a malformed query string and broke signature checks.
    url = ('https://oapi.dingtalk.com/robot/send?access_token=' + access_token
           + '&timestamp=' + timestamp + '&sign=' + sign)
    response = requests.post(url, json=data, headers=headers)
    if response.status_code != 200:
        raise Exception("Failed to send dingding message")
def get_k_h(url):
    """Append Sogou's anti-crawler 'k' and 'h' query parameters to *url*.

    'k' is a random integer in [1, 100]; 'h' is the single character of
    *url* found k positions past the 21st character after "url=".
    """
    k = int(random.random() * 100) + 1
    pos = url.find("url=") + 4 + 21 + k
    return "{}&k={}&h={}".format(url, k, url[pos:pos + 1])
def get_real_url(content):
    """Extract the real article URL from a JS snippet of the form = '<url>';

    All captured fragments are concatenated; returns '' when none match.
    """
    fragments = re.findall(r"= \'(\S+?)\';", content, re.S)
    return "".join(fragments)
def html_url():
    """Search weixin.sogou.com for the daily-briefing account page.

    Returns:
        list[str]: absolute URLs of the anchors tagged uigs='account_article_0'
        on the first search-result page.
    """
    base_url = "https://weixin.sogou.com/weixin"
    keyword = "每日心语简报"
    query_params = {
        "ie": "utf8",
        "s_from": "input",
        "_sug_": "y",
        "_sug_type_": "",
        "type": "1",
        "query": keyword
    }
    # urlencode percent-encodes the non-ASCII keyword; the original manual
    # "&".join sent it raw and relied on requests re-quoting the URL.
    url = base_url + "?" + urllib.parse.urlencode(query_params)
    # NOTE(review): `session` must be a requests.Session created elsewhere
    # in the script before this runs -- confirm it exists in the full file.
    r = session.get(url, headers=headers, allow_redirects=True)
    soup = BeautifulSoup(r.text, "html.parser")
    link_list = []
    for link in soup.find_all('a', uigs='account_article_0'):
        href = link.get('href')
        if href:  # guard: anchors without href would crash the concatenation
            link_list.append("https://weixin.sogou.com" + href)
    return link_list
def get_url():
    """Fetch each search-result article, extract the briefing section and
    push its formatted text to DingTalk."""
    # DingTalk robot token
    access_token = "beb88xxxxxxxxxxxxxxxx"
    url_list = html_url()
    for link in url_list:
        # Add Sogou's anti-crawler k/h parameters, then follow redirects.
        link = get_k_h(link)
        response = session.get(link, headers=headers, allow_redirects=True)
        final_url = response.url
        # Fetch the intermediate page and extract the real article URL
        # embedded in its JavaScript.
        response = session.get(final_url, headers=headers, allow_redirects=True)
        content = response.text
        real_url = get_real_url(content)
        res = session.get(real_url, headers=headers)
        cont_html = res.text
        soup = BeautifulSoup(cont_html, 'html.parser')
        # Locate the briefing by its exact inline style attribute.
        # NOTE(review): brittle -- any change to the article template's
        # inline CSS will silently stop matching; verify periodically.
        target_section = soup.find('section',
                                   style='padding-top: 10px;outline: 0px;max-width: 100%;box-sizing: border-box;border-color: rgb(252, 180, 43);visibility: visible;overflow-wrap: break-word !important;')
        if target_section:
            target_text = target_section.get_text()
            # Break the text into lines after each ';' or '。', and start a
            # new line before each numbered item like "1、".
            form_message = re.sub(r'([;。])', r'\1\n', target_text)
            formatted_message = re.sub(r'(\d+、)', r'\n\1', form_message)
            # Work on the reversed string so the counted substitutions hit
            # the LAST occurrences of ';' in the original text: 11 + 1 = 12
            # trailing semicolons get an extra newline after re-reversal.
            # (Presumably spacing for the final briefing items -- confirm
            # against actual article output.)
            reversed_message = ''.join(reversed(formatted_message))
            reversed_message = re.sub(r';', r'\n;', reversed_message, count=11)
            reversed_message = re.sub(r';', r'\n;', reversed_message, count=1)
            message = ''.join(reversed(reversed_message))
            dingding_notify(message, access_token)
        else:
            print("目标section未找到")
if __name__ == '__main__':
    # Script entry point: scrape the latest briefing and push it to DingTalk.
    get_url()



评论: