python3抓取每日心语新闻
之前有个群友在群里炫耀他用 Python 爬取某个网站的内容,天天给自己的网站发布最新的新闻。我问到了爬取的源站,于是自己也花了几天时间写了一个,并把每日新闻推送到钉钉上了。
系统:centos7.x(64位)
cat /root/soft_shell/weixin_sogou.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
Scrape the latest "每日心语简报" article from Sogou Weixin search and push
its text to a DingTalk robot (signed webhook).

@IDE    :PyCharm
@Author :rocdk890
@Date   :2023/05/31 14:32
@Update : 2024/02/18 17:17
'''
import ssl
import socket
import os
import base64
import json
import datetime
import hashlib
import hmac
import time
import urllib.parse
import requests
import random
import re
import textwrap
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# FIX: the original code called `session.get(...)` everywhere but never
# created the session, which raises NameError at runtime.  Build one shared
# Session with retries -- this is evidently what the HTTPAdapter/Retry
# imports were intended for.
session = requests.Session()
_retry = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504))
session.mount("http://", HTTPAdapter(max_retries=_retry))
session.mount("https://", HTTPAdapter(max_retries=_retry))


def dingding_notify(message, access_token):
    """Send *message* as a text message to the DingTalk robot *access_token*.

    Uses the signed-webhook scheme: HMAC-SHA256 over "<timestamp>\\n<secret>",
    base64-encoded then URL-quoted.  Raises Exception if the HTTP status is
    not 200.
    """
    timestamp = str(round(time.time() * 1000))
    # DingTalk robot signing secret ("加签" mode)
    secret = 'SEC9xxxxxxxxxxxxxxxxxxxxxxxxxxx'
    secret_enc = secret.encode('utf-8')
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    string_to_sign_enc = string_to_sign.encode('utf-8')
    hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
    sign = urllib.parse.quote(base64.b64encode(hmac_code))
    req_headers = {
        "Content-Type": "application/json",
        "Charset": "UTF-8"
    }
    data = {
        "msgtype": "text",
        "text": {
            "content": message
        }
    }
    # FIX: the original URL contained the mojibake "×tamp=" (an unescaped
    # HTML entity "&times"); the signed webhook requires "&timestamp=".
    url = ('https://oapi.dingtalk.com/robot/send?access_token=' + access_token
           + "&timestamp=" + timestamp + "&sign=" + sign)
    response = requests.post(url, json=data, headers=req_headers)
    if response.status_code != 200:
        raise Exception("Failed to send dingding message")


def get_k_h(url):
    """Append the anti-bot "k" and "h" query parameters Sogou expects.

    k is random in [1, 100]; h is the single character of the URL located
    k positions past the 21st character after "url=" (Sogou's own JS does
    the same calculation).
    """
    b = int(random.random() * 100) + 1
    a = url.find("url=")
    url = url + "&k=" + str(b) + "&h=" + url[a + 4 + 21 + b: a + 4 + 21 + b + 1]
    return url


def get_real_url(content):
    """Reassemble the real article URL from Sogou's redirect page.

    The redirect page builds the target URL in JavaScript as a series of
    `url += '<fragment>';` statements; collect every quoted fragment and
    join them.
    """
    url_text = re.findall(r"= \'(\S+?)\';", content, re.S)
    best_url = ''.join(url_text)
    return best_url


def html_url():
    """Search Sogou Weixin for the briefing account; return article links."""
    base_url = "https://weixin.sogou.com/weixin"
    keyword = "每日心语简报"
    query_params = {
        "ie": "utf8",
        "s_from": "input",
        "_sug_": "y",
        "_sug_type_": "",
        "type": "1",
        "query": keyword
    }
    # FIX: percent-encode the parameters (the keyword is non-ASCII); the
    # original concatenated raw values straight into the URL.
    url = base_url + "?" + urllib.parse.urlencode(query_params)
    r = session.get(url, headers=headers, allow_redirects=True)
    soup = BeautifulSoup(r.text, "html.parser")
    # The first article result carries the uigs="account_article_0" marker.
    links = soup.find_all('a', uigs='account_article_0')
    link_list = []
    for link in links:
        href = link.get('href')
        link_list.append("https://weixin.sogou.com" + href)
    return link_list


def get_url():
    """Fetch the latest article, extract the briefing text, push to DingTalk."""
    # DingTalk robot access token
    access_token = "beb88xxxxxxxxxxxxxxxx"
    for link in html_url():
        link = get_k_h(link)
        response = session.get(link, headers=headers, allow_redirects=True)
        final_url = response.url
        # The redirect target is an intermediate page whose JS holds the
        # real article URL; pull it out with get_real_url().
        response = session.get(final_url, headers=headers, allow_redirects=True)
        real_url = get_real_url(response.text)
        res = session.get(real_url, headers=headers)
        soup = BeautifulSoup(res.text, 'html.parser')
        # The briefing body lives in a <section> identified by its exact
        # inline style string (brittle, but that is how the page is built).
        target_section = soup.find('section', style='padding-top: 10px;outline: 0px;max-width: 100%;box-sizing: border-box;border-color: rgb(252, 180, 43);visibility: visible;overflow-wrap: break-word !important;')
        if target_section:
            target_text = target_section.get_text()
            # Break lines after sentence separators and before numbered items.
            form_message = re.sub(r'([;。])', r'\1\n', target_text)
            formatted_message = re.sub(r'(\d+、)', r'\n\1', form_message)
            # Work on the reversed text so the last few separators (half-
            # and full-width semicolons) also get line breaks.
            reversed_message = ''.join(reversed(formatted_message))
            reversed_message = re.sub(r';', r'\n;', reversed_message, count=11)
            reversed_message = re.sub(r';', r'\n;', reversed_message, count=1)
            message = ''.join(reversed(reversed_message))
            dingding_notify(message, access_token)
        else:
            print("目标section未找到")


if __name__ == '__main__':
    get_url()
评论: