Project author: superonesfazai

Project description:
This is fz's Python utils package, for spiders! enjoy!
Language: Python
Project URL: git://github.com/superonesfazai/fzutils.git


███████╗███████╗██╗   ██╗████████╗██╗██╗     ███████╗
██╔════╝╚══███╔╝██║   ██║╚══██╔══╝██║██║     ██╔════╝
█████╗    ███╔╝ ██║   ██║   ██║   ██║██║     ███████╗
██╔══╝   ███╔╝  ██║   ██║   ██║   ██║██║     ╚════██║
██║     ███████╗╚██████╔╝   ██║   ██║███████╗███████║
╚═╝     ╚══════╝ ╚═════╝    ╚═╝   ╚═╝╚══════╝╚══════╝


fzutils

What is this?

This is fz's Python utils package, for spiders.

Goal: an integrated toolkit for efficient, rapid crawler development.

Install

    pip3 install fzutils
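
To confirm the package is importable after installation, a quick sanity check (the one-liner below is just an import probe, nothing fzutils-specific):

    python3 -c "import fzutils"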

Requirements

Python 3 (installation above uses pip3).

simple use

    from fzutils.ip_pools import (
        IpPools,
        ip_proxy_pool,
        fz_ip_pool,)

    # High-anonymity proxies.
    # type defaults to ip_proxy_pool; switch to fz_ip_pool depending on which IP pool you use.
    ip_obj = IpPools(type=ip_proxy_pool, high_conceal=True)
    # Get a random proxy, eg: 'http://175.6.2.174:8088'
    proxy = ip_obj._get_random_proxy_ip()

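The proxy string has the plain 'http://host:port' form, so it can be dropped straight into a requests proxies mapping. A minimal sketch (requests is not part of fzutils; the target URL is a placeholder):

    import requests

    proxy = 'http://175.6.2.174:8088'  # e.g. the value returned by _get_random_proxy_ip()
    # Route both http and https traffic through the same proxy.
    resp = requests.get('http://httpbin.org/ip',
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)
    print(resp.text)
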
    from fzutils.spider.crawler import Crawler, AsyncCrawler
    from fzutils.ip_pools import fz_ip_pool

    class ASpider(Crawler):  # Crawler is the spider base class
        def __init__(self, logger=None) -> None:
            super(ASpider, self).__init__(
                ip_pool_type=fz_ip_pool,
                log_print=True,
                logger=logger,
                log_save_path='path to save log files',
                is_use_driver=True,
                driver_executable_path='path to the driver executable',
            )

    class BSpider(AsyncCrawler):
        """An async spider."""
        pass

    _ = ASpider()

    from fzutils.spider.fz_driver import BaseDriver, PHANTOMJS
    from fzutils.ip_pools import ip_proxy_pool

    # ip_pool_type also defaults to ip_proxy_pool.
    # BaseDriver supports phantomjs, chromedriver and firefoxdriver.
    _ = BaseDriver(type=PHANTOMJS, executable_path='xxx', ip_pool_type=ip_proxy_pool)
    exec_code = '''
    js = 'document.body.scrollTop=10000'
    self.driver.execute_script(js)
    '''
    body = _.get_url_body(url='xxx', exec_code=exec_code)

    from fzutils.spider.fz_requests import Requests
    from fzutils.ip_pools import ip_proxy_pool

    # ip_pool_type also defaults to ip_proxy_pool.
    body = Requests.get_url_body(method='get', url='xxx', ip_pool_type=ip_proxy_pool)

    import asyncio
    from fzutils.spider.fz_aiohttp import AioHttp

    async def tmp():
        _ = AioHttp(max_tasks=5)
        return await _.aio_get_url_body(url='xxx', headers={})

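Coroutines do not run by themselves; drive tmp() with an event loop (standard asyncio, nothing fzutils-specific):

    loop = asyncio.get_event_loop()
    body = loop.run_until_complete(tmp())
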
    from fzutils.time_utils import (
        fz_set_timeout,
        fz_timer,)
    from time import sleep
    import sys

    # Set an execution timeout (in seconds).
    @fz_set_timeout(2)
    def tmp():
        sleep(3)

    # Time a function; supports sys.stdout.write or logger.info as the print function.
    @fz_timer(print_func=sys.stdout.write)
    def tmp_2():
        sleep(3)

    tmp()
    tmp_2()

    from fzutils.log_utils import set_logger
    from logging import INFO, ERROR

    logger = set_logger(log_file_name='path', console_log_level=INFO, file_log_level=ERROR)

    from fzutils.auto_ops_utils import auto_git

    # Automated git operations.
    auto_git(path='xxx/path')

    from fzutils.path_utils import cd

    # cd into the target directory as a context and do other work inside it.
    with cd('path'):
        pass

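For reference, such a cd context manager is commonly written like the sketch below (illustrative only, not necessarily fzutils' actual source): save the current directory, switch, and always switch back.

    import os
    from contextlib import contextmanager

    @contextmanager
    def cd_sketch(path):
        # Remember where we were, enter the target directory,
        # and restore the old directory even if the body raises.
        prev = os.getcwd()
        os.chdir(path)
        try:
            yield
        finally:
            os.chdir(prev)
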
    from fzutils.sql_utils import (
        BaseSqlServer,
        pretty_table,)

    _ = BaseSqlServer(host='host', user='user', passwd='passwd', db='db', port='port')
    # Pretty-print a db query result.
    pretty_table(
        cursor=_._get_one_select_cursor(
            sql_str='sql_str',
            params=('some_thing',)))

    from fzutils.linux_utils import (
        kill_process_by_name,
        process_exit,)

    # Kill a process by its process_name.
    kill_process_by_name(process_name='xxxx')
    # Check whether a process with the given process_name exists.
    process_exit(process_name='xxxx')

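Helpers like these are usually thin wrappers around system tools such as pgrep/pkill. A rough sketch of the idea (illustrative, not fzutils' implementation):

    import subprocess

    def process_exists_sketch(process_name):
        # pgrep exits with code 0 when at least one matching process is found.
        return subprocess.call(
            ['pgrep', '-f', process_name],
            stdout=subprocess.DEVNULL) == 0
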
    from fzutils.linux_utils import daemon_init

    def run_forever():
        pass

    # Daemonize the current process, then run the long-lived worker.
    daemon_init()
    run_forever()

    from fzutils.internet_utils import (
        get_random_pc_ua,
        get_random_phone_ua,)

    # Random user-agents.
    pc_user_agent = get_random_pc_ua()
    phone_user_agent = get_random_phone_ua()

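A random UA is typically fed into request headers so each request looks like a different client. A minimal sketch using requests (requests and the target URL are not part of fzutils):

    import requests
    from fzutils.internet_utils import get_random_pc_ua

    # Send the randomized user-agent with an ordinary GET request.
    headers = {'User-Agent': get_random_pc_ua()}
    resp = requests.get('http://httpbin.org/headers', headers=headers, timeout=10)
    print(resp.text)
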
    from fzutils.common_utils import _print

    # Supports sys.stdout.write or a logger; `logger` and the exception `e`
    # are assumed to exist in the surrounding scope.
    _print(msg='xxx', logger=logger, exception=e, log_level=2)

    from fzutils.auto_ops_utils import (
        upload_or_download_files,
        local_compress_folders,
        remote_decompress_folders,)
    from fabric.connection import Connection

    connect_obj = Connection()
    # Upload or download files between local and server.
    upload_or_download_files(
        method='put',
        connect_object=connect_obj,
        local_file_path='/Users/afa/myFiles/tmp/my_spider_logs.zip',
        remote_file_path='/root/myFiles/my_spider_logs.zip'
    )
    # Compress a local folder into a zip file.
    local_compress_folders(
        father_folders_path='/Users/afa/myFiles',
        folders_name='my_spider_logs',
        default_save_path='xxxxxx'
    )
    # Decompress a zip file on the remote server.
    remote_decompress_folders(
        connect_object=connect_obj,
        folders_path='/root/myFiles/my_spider_logs.zip',
        target_decompress_path='/root/myFiles/'
    )

    from fzutils.common_utils import json_2_dict

    # Convert json to a dict; tolerates some malformed json.
    _dict = json_2_dict(json_str='json_str', logger=logger, encoding='utf-8')

    from fzutils.auto_ops_utils import judge_whether_file_exists
    from fabric.connection import Connection

    connect_obj = Connection()
    # Check whether a file exists on the server.
    result = judge_whether_file_exists(connect_object=connect_obj, file_path='file_path')

    from fzutils.email_utils import FZEmail

    _ = FZEmail(user='xxx', passwd='password or smtp authorization code')
    _.send_email(to=['xxx@gmail.com',], subject='email subject', text='email body')

    from requests import Session
    from fzutils.common_utils import (
        save_obj,
        get_obj,)

    s = Session()
    # Persist an object to disk, then load it back.
    save_obj(s, 's.txt')
    get_obj('s.txt')

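Object persistence like this is usually just pickle under the hood. An illustrative sketch (not fzutils' actual source):

    import pickle

    def save_obj_sketch(obj, path):
        # Serialize the object to a file in binary mode.
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    def get_obj_sketch(path):
        # Deserialize and return the object stored at path.
        with open(path, 'rb') as f:
            return pickle.load(f)
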
    from fzutils.data.str_utils import (
        char_is_chinese,
        char_is_alphabet,
        char_is_number,
        char_is_other,)

    # Classify a single character.
    print(char_is_chinese('你'))
    print(char_is_alphabet('a'))
    print(char_is_number('1'))
    print(char_is_other('_'))

    from fzutils.algorithm_utils import merge_sort

    # Merge sort; many other sorting utilities are available as well.
    print(merge_sort([-1, 2, 1]))

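For readers unfamiliar with the algorithm: merge sort recursively splits the list in half, sorts each half, and merges the two sorted halves. A textbook sketch (illustrative, not fzutils' implementation):

    def merge_sort_sketch(arr):
        if len(arr) <= 1:
            return arr
        mid = len(arr) // 2
        left = merge_sort_sketch(arr[:mid])
        right = merge_sort_sketch(arr[mid:])
        # Merge the two sorted halves.
        merged, i, j = [], 0, 0
        while i < len(left) and j < len(right):
            if left[i] <= right[j]:
                merged.append(left[i])
                i += 1
            else:
                merged.append(right[j])
                j += 1
        merged.extend(left[i:])
        merged.extend(right[j:])
        return merged

    print(merge_sort_sketch([-1, 2, 1]))  # -> [-1, 1, 2]
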
    from fzutils.data.pickle_utils import deserializate_pickle_object
    from pickle import dumps

    a = dumps({'1': 1,})
    # Deserialize a pickled python object.
    print(deserializate_pickle_object(a))

    from fzutils.aio_utils import get_async_execute_result

    # Get the result of an async execution.
    res = get_async_execute_result(obj='some class', obj_method_name='some class method',)

    from fzutils.common_utils import retry

    def validate_res(res):
        """Validate the result: accept it only when it equals 5."""
        return res == 5

    # Retry decorator: retry up to max_retries times until validate_func passes.
    @retry(max_retries=4, validate_func=validate_res)
    def a(t):
        return t - 2

    print(a(7))
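
A validate-and-retry decorator like this is commonly structured as below (a sketch under assumed semantics; fzutils' actual retry may differ, e.g. in what it returns when all retries fail):

    from functools import wraps

    def retry_sketch(max_retries, validate_func):
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                res = None
                for _ in range(max_retries):
                    res = func(*args, **kwargs)
                    if validate_func(res):
                        break  # result accepted, stop retrying
                return res
            return wrapper
        return decorator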

curl

Convert a curl command to Python code

    from fzutils.curl_utils import curl_cmd_2_py_code

    # Prerequisite: curlconverter is installed (npm install --save curlconverter).
    curl_cmd = "curl 'http://en.wikipedia.org/' -H 'Accept-Encoding: gzip, deflate, sdch' -H 'Accept-Language: en-US,en;q=0.8' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' -H 'Referer: http://www.wikipedia.org/' -H 'Cookie: GeoIP=US:Albuquerque:35.1241:-106.7675:v4; uls-previous-languages=%5B%22en%22%5D; mediaWiki.user.sessionId=VaHaeVW3m0ymvx9kacwshZIDkv8zgF9y; centralnotice_buckets_by_campaign=%7B%22C14_enUS_dsk_lw_FR%22%3A%7B%22val%22%3A%220%22%2C%22start%22%3A1412172000%2C%22end%22%3A1422576000%7D%2C%22C14_en5C_dec_dsk_FR%22%3A%7B%22val%22%3A3%2C%22start%22%3A1417514400%2C%22end%22%3A1425290400%7D%2C%22C14_en5C_bkup_dsk_FR%22%3A%7B%22val%22%3A1%2C%22start%22%3A1417428000%2C%22end%22%3A1425290400%7D%7D; centralnotice_bannercount_fr12=22; centralnotice_bannercount_fr12-wait=14' -H 'Connection: keep-alive' --compressed"
    res = curl_cmd_2_py_code(curl_cmd)

OCR recognition

    from fzutils.ocr_utils import (
        baidu_ocr_captcha,
        baidu_orc_image_main_body,
        get_tracks_based_on_distance,
        dichotomy_match_gap_distance,)

    # Recognize a captcha with Baidu OCR.
    captcha = baidu_ocr_captcha(
        app_id='xx',
        api_key='xx',
        secret_key='xx',
        img_path='image path',
        orc_type=2)
    # Locate the main body of an image with Baidu OCR.
    img_url = 'https://www.baidu.com/link?url=phUVHvSMIfwj2DPXnprj0BTv4loPocnLfNn-CVb7UQE4NLe7PH8GbrYKDkX2hzyp17Eqhy-s1rP8Zg92NEt0vqUxm_nhLoyRTaaxMFwq1oMdPaG_krazDsxHgLlql9QkZB92VhsTirtG53MvyecIFLjWeHjdyGCyTOaS-UcksfOJkPFOAJOFe4AoCxW5qQUbTahhjhjXWyihP-XmYIR5z-Gt3esBvFJpuHhUy7W6OODMrUZ2v7mUa9ng2BFKDy2MREyZQcXW80D3eDqWbIFLQ5BtEqWEknWa_1kxKXf4qo7GAZjkANyTP8D2PN0jHRw2AiWtN3d57J6GP4hksByVAzwIJWeWIiObv69Q1ekb2O_WsYLbKfzIsVLdlZGm5SHXnMgKZkRay_I8NKeq-wUb2wLKsGCjhRC1AV-GSv5Q7fIEj1QrSgQjLnW6Fjh55M5AaM9JRJLlXWhANegCn6jpJhnL7vcV1-kDgUcKQVFNq27fol2E2fG-d7ja03dizHCawAsIr6ortoWeqDdpyW4VOesI1VU6_WDdAWs96KZqVD2gATBs1U_D5nbYC9DAuZYK&wd=&eqid=81209347000143bf000000035b933e62'
    res = baidu_orc_image_main_body(img_url=img_url)
    # Generate a human-like movement track for the given distance.
    tracks = get_tracks_based_on_distance(distance=100)
    # Binary-search the distance between the slider and the gap.
    distance = dichotomy_match_gap_distance(bg_img_path='xxx', slide_img_path='xxx')
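
Such tracks are typically replayed through selenium's ActionChains to drag a slider. A hypothetical sketch, assuming tracks is an iterable of per-step x offsets, `driver` is an existing selenium WebDriver, and the CSS selector is a placeholder:

    from selenium.webdriver import ActionChains
    from fzutils.ocr_utils import get_tracks_based_on_distance

    tracks = get_tracks_based_on_distance(distance=100)
    slider = driver.find_element_by_css_selector('.slider')  # placeholder selector
    ActionChains(driver).click_and_hold(slider).perform()
    for x in tracks:
        # Replay one step of the track; a fresh ActionChains per step
        # avoids re-running previously queued actions.
        ActionChains(driver).move_by_offset(xoffset=x, yoffset=0).perform()
    ActionChains(driver).release().perform()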

qrcode

QR code decoding

    from fzutils.qrcode_utils import decode_qrcode

    img_url = 'https://i.loli.net/2018/11/15/5bed1adce184e.jpg'
    print(decode_qrcode(img_url=img_url))

Batch account registration

    from pprint import pprint
    from fzutils.register_utils import YiMaSmser

    _ = YiMaSmser(username='account', pwd='password')
    project_id = 715
    while True:
        # Get a fresh phone number.
        phone_num = _._get_phone_num(project_id=project_id)
        print(phone_num)
        a = input('Usable? (y/n): ')
        if a == 'y':
            break

    print('\nUnregistered: {}'.format(phone_num))
    # Get the SMS messages sent to this phone number.
    sms_res = _._get_sms(phone_num=phone_num, project_id=project_id)
    print(sms_res)
    # Check your account balance.
    money_res = _._get_account_info()
    pprint(money_res)

    from time import time, sleep
    from fzutils.register_utils import TwentyFourEmail

    _ = TwentyFourEmail()
    email_address = _._get_email_address()
    print('Got email_address: {}'.format(email_address))
    # # Switch to a new mailbox:
    # email_address = _._get_new_email_address()
    # print(email_address)
    message_count = lambda: _._get_email_message_count()
    start_time = time()
    index = 1
    # Poll until a message arrives or 100 seconds have passed.
    while message_count() in (0, None) and time() - start_time < 100.:
        sleep_time = 2
        print('try {}, sleeping {}s...'.format(index, sleep_time))
        sleep(sleep_time)
        index += 1

    message_list = _._get_email_message_list()
    print(message_list)

Code template generation

    from fzutils.spider.auto import auto_generate_crawler_code

    # Interactively generate boilerplate spider code.
    auto_generate_crawler_code()
    """
    Shell output looks like:
    #--------------------------------
    # Spider template generator by super_fazai
    #--------------------------------
    @@ The following parameters are optional; defaults are used when left empty!!
    Enter author: super_fazai
    Enter email: superonesfazai@gmail.com
    Enter the file name to create (without .py): fz_spider_demo
    Enter class_name: FZSpiderDemo
    Spider file fz_spider_demo.py created!
    enjoy!🍺
    """
    # Many more common utility functions are waiting for you to explore...

Resources

fzutils home: <https://www.github.com/superonesfazai/python>

Copyright and warranty

The code in this distribution is copyright (c) super_fazai unless explicitly stated otherwise.

fzutils is provided under the MIT license, which is described in detail in the included LICENSE file.

Contributors

  • super_fazai

Author

super_fazai