python3抓取每日心语新闻

post by rocdk890 / 2024-2-18 18:04 Sunday linux技术
   之前群里有个群友在群里炫耀他用python爬取一个网站,天天给他自己的网站发布最新的新闻,我问到了爬取的源站,故自己也花了几天时间写了一个,并且把每日新闻发到钉钉上了.
   系统:centos7.x(64位)

cat /root/soft_shell/weixin_sogou.py
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@IDE     :PyCharm
@Author  :rocdk890
@Date    :2023/05/31 14:32
@Update  : 2024/02/18 17:17
'''

import ssl
import socket
import os
import base64
import json
import datetime
import hashlib
import hmac
import time
import urllib.parse
import requests
import random
import re
import textwrap
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry


# Shared request headers: a desktop Chrome UA so sogou serves the normal page.
headers = {
       "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# BUG FIX: html_url() and get_url() call a module-level `session` that was
# never defined anywhere in the file (NameError at runtime).  Create it here,
# and mount a retry adapter -- this is what the otherwise-unused
# Retry/HTTPAdapter imports at the top of the file were for.  A shared Session
# also keeps sogou's anti-bot cookies across the search/redirect requests.
session = requests.Session()
_retry = Retry(total=3, backoff_factor=0.5, status_forcelist=(500, 502, 503, 504))
session.mount('http://', HTTPAdapter(max_retries=_retry))
session.mount('https://', HTTPAdapter(max_retries=_retry))

def dingding_notify(message, access_token):
       timestamp = str(round(time.time() * 1000))
       # dingding 加签密钥
       secret = 'SEC9xxxxxxxxxxxxxxxxxxxxxxxxxxx'
       secret_enc = secret.encode('utf-8')
       string_to_sign = '{}\n{}'.format(timestamp, secret)
       string_to_sign_enc = string_to_sign.encode('utf-8')
       hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest()
       sign = urllib.parse.quote(base64.b64encode(hmac_code))

       headers = {
              "Content-Type": "application/json",
              "Charset": "UTF-8"
       }
       data = {
              "msgtype": "text",
              "text": {
                     "content": message
              }
       }
       url = 'https://oapi.dingtalk.com/robot/send?access_token=' + access_token + "&timestamp=" + timestamp + "&sign=" + sign
       response = requests.post(url, json=data, headers=headers)
       if response.status_code != 200:
              raise Exception("Failed to send dingding message")

def get_k_h(url):
       """Append sogou's anti-bot ``k``/``h`` query parameters to *url*.

       Mirrors the JavaScript on the search-result page: ``k`` is a random
       integer in 1..100 and ``h`` is the single character of *url* found at
       an offset derived from the position of ``"url="`` plus ``k``.
       """
       k = int(random.random() * 100) + 1
       anchor = url.find("url=")
       h_index = anchor + 4 + 21 + k
       return f"{url}&k={k}&h={url[h_index:h_index + 1]}"


def get_real_url(content):
       """Extract the real article URL from sogou's redirect page *content*.

       The redirect page builds the target with JS statements like
       ``url += 'piece';`` -- collect every quoted piece and join them.

       :param content: HTML/JS text of the redirect page.
       :return: the concatenated URL, or '' when nothing matches.
       """
       # FIX: use a raw string -- the original "= \'(\S+?)\';" relied on the
       # invalid escape sequence \S (SyntaxWarning since Python 3.12).
       # The pattern value itself is unchanged.
       url_text = re.findall(r"= '(\S+?)';", content, re.S)
       return ''.join(url_text)


def html_url():
       """Search sogou weixin for the daily-briefing account and return the
       first-article links as absolute URLs.

       :return: list of ``https://weixin.sogou.com/...`` redirect links
                (empty when the page yields no matching anchors).
       """
       base_url = "https://weixin.sogou.com/weixin"
       keyword = "每日心语简报"

       query_params = {
              "ie": "utf8",
              "s_from": "input",
              "_sug_": "y",
              "_sug_type_": "",
              "type": "1",
              "query": keyword
       }

       # FIX: pass params= so requests percent-encodes the query string; the
       # original hand-assembled the URL and pasted the Chinese keyword in raw.
       r = session.get(base_url, params=query_params, headers=headers,
                       allow_redirects=True)
       soup = BeautifulSoup(r.text, "html.parser")

       # sogou marks each account's first article link with uigs="account_article_0";
       # the hrefs are site-relative, so prefix the host.
       links = soup.find_all('a', uigs='account_article_0')
       return ["https://weixin.sogou.com" + link.get('href') for link in links]


def get_url():
       """Walk every search-result link, resolve it to the real article,
       extract the briefing section and push the formatted text to DingTalk.

       NOTE(review): relies on a module-level `session` that is not defined
       anywhere in this file as published -- confirm a requests.Session is
       created at import time, otherwise this raises NameError.
       """
       # DingTalk robot access token (redacted in the published article)
       access_token = "beb88xxxxxxxxxxxxxxxx"
       url_list = html_url()
       for link in url_list:
              # add sogou's anti-bot k/h params, then follow the redirect chain
              link = get_k_h(link)
              response = session.get(link, headers=headers, allow_redirects=True)
              final_url = response.url

              # fetch the intermediate page and pull the real article URL out
              # of its JS (see get_real_url)
              response = session.get(final_url, headers=headers, allow_redirects=True)
              content = response.text
              real_url = get_real_url(content)

              res = session.get(real_url, headers=headers)
              cont_html = res.text
              soup = BeautifulSoup(cont_html, 'html.parser')
              # The briefing lives in a <section> matched by its exact inline
              # style string -- brittle: any style change upstream breaks this.
              target_section = soup.find('section',
                                         style='padding-top: 10px;outline: 0px;max-width: 100%;box-sizing: border-box;border-color: rgb(252, 180, 43);visibility: visible;overflow-wrap: break-word !important;')

              if target_section:
                     target_text = target_section.get_text()
                     # newline after every ';' or '。' terminator
                     form_message = re.sub(r'([;。])', r'\1\n', target_text)
                     # newline before each numbered item like "1、"
                     formatted_message = re.sub(r'(\d+、)', r'\n\1', form_message)
                     # Work on the REVERSED string so the substitutions hit the
                     # LAST semicolons: the final 11 ';' each gain a trailing
                     # newline, and the very last one gains a second newline --
                     # presumably to split off the closing quote/footer block
                     # of the briefing (confirm against actual output).
                     reversed_message = ''.join(reversed(formatted_message))
                     reversed_message = re.sub(r';', r'\n;', reversed_message, count=11)
                     reversed_message = re.sub(r';', r'\n;', reversed_message, count=1)
                     message = ''.join(reversed(reversed_message))
                     dingding_notify(message, access_token)
              else:
                     print("目标section未找到")


# Entry point: scrape today's briefing and push it to DingTalk.
if __name__ == '__main__':
       get_url()


点击查看原图
夜空- 本站版权
1、本站所有主题由该文章作者发表,该文章作者与夜空享有文章相关版权
2、其他单位或个人使用、转载或引用本文时必须同时征得该文章作者和夜空的同意
3、本帖部分内容转载自其它媒体,但并不代表本站赞同其观点和对其真实性负责
4、如本帖侵犯到任何版权问题,请立即告知本站,本站将及时予与删除并致以最深的歉意
5、原文链接:blog.slogra.com/post-810.html

标签: 抓取 python3 爬虫 每日心语 新闻

评论: