Python 2.7과 selenium + webdriver(chromedriver), BeautifulSoup4를 이용하여 만든 크롤러입니다. 네이버에 로그인한 뒤 메일 페이지에서 특정 사용자로 검색하고, 안 읽은 메일만 가져오도록 필터링하여 메일 내부의 첨부파일을 특정 위치로 다운로드받도록 만들었으며, 동작하는 모습을 영상으로 찍어 봤습니다. 네이버의 경우 로그인할 때 의심스러운 행위를 자동으로 감지하여 캡차로 막기 때문에, 이를 우회하기 위해 네이버 메인을 경유하여 로그인 페이지로 이동하고 중간중간 랜덤한 시간 동안 Sleep하는 코드를 넣어두었습니다. 소스코드는 다음과 같으며, 라이선스는 MIT이므로 누구나 원하는 곳에 사용이 가능하지만 이 프로그램을 사용함에 있어 발생하는 모든 책임은 사용자에게 있습니다.




소스코드


NaverMailCrawler.py

# The MIT License
#
# Copyright (c) 2018 Sanghyeon Jeon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import os
import time
from random import randrange
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from bs4 import *
import Utils
# ===== Config =====
# Assigned to ChromeOptions.headless in SetDriver(); True hides the browser window.
HEADLESS = False
# Text typed into the mail search box; None skips the search step entirely.
SEARCH_KEYWORD = None
# Absolute path (resolved against the current working directory) that Chrome
# is told to save downloaded attachments into.
DOWNLOAD_PATH = os.path.realpath("download")
# Placeholder credentials handed to Login() by main(); replace before running.
USER_ID = "USER_ID"
PASSWORD = "PASSWORD"
# Login page opened by Login() after it has visited the Naver main page.
LOGIN_URL = "https://nid.naver.com/nidlogin.login"
# ==================
class NaverMailCrawler(object):
    """Selenium-driven crawler that logs in to Naver and downloads mail attachments.

    The Chrome driver is created lazily: every method decorated with
    ``DriverCheck`` calls ``SetDriver()`` first when ``self.driver`` is unset.
    Call ``Close()`` when finished to quit the browser session.
    """

    def __init__(self):
        """Create the logger, resolve the chromedriver path and ensure the
        download directory exists. The browser itself is not started here."""
        self.logger = Utils.CreateLogger("NaverMailCrawler")
        self.driverPath = os.path.realpath('chromedriver.exe')
        self.driver = None
        # Utils.Mkdirs() only creates the *parent* of the path it receives, so
        # the download directory itself has to be created directly.
        if not os.path.exists(DOWNLOAD_PATH):
            os.makedirs(DOWNLOAD_PATH)

    def __del__(self):
        """Best-effort cleanup so the browser session does not outlive the crawler."""
        try:
            self.Close()
        except Exception:
            # A finalizer must never raise; the instance may be only partly
            # initialised or the interpreter may already be shutting down.
            pass

    def Close(self):
        """Quit the browser session if one was started.

        Safe to call more than once; a failure while quitting is logged and
        not re-raised.
        """
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        # Drop the reference first so a failing quit() is not retried later.
        self.driver = None
        try:
            driver.quit()
        except Exception as e:
            self.logger.error("Driver Quit Failed : %s", str(e))

    # Decorator for checking driver
    def DriverCheck(targetFunction):
        """Wrap a method so that ``SetDriver()`` runs first when no driver exists."""
        def wrapper(self, *args, **kwargs):
            if self.driver is None:
                self.logger.info("Driver not ready, calling SetDriver()...")
                self.SetDriver()
            return targetFunction(self, *args, **kwargs)
        # Preserve the wrapped method's identity for tracebacks and help().
        wrapper.__name__ = targetFunction.__name__
        wrapper.__doc__ = targetFunction.__doc__
        return wrapper

    def SetDriver(self):
        """Start Chrome with the download preferences and store it in ``self.driver``."""
        options = webdriver.ChromeOptions()
        options.headless = HEADLESS
        preferences = {
            "download.default_directory": DOWNLOAD_PATH,
            "directory_upgrade": True,
            "safebrowsing.enabled": True,
            "profile.default_content_setting_values.automatic_downloads": 2
        }
        options.add_experimental_option("prefs", preferences)
        self.driver = webdriver.Chrome(executable_path = self.driverPath, chrome_options = options)
        self.driver.set_page_load_timeout(15) # 15 seconds timeout
        self.driver.set_window_size(1000, 600)
        self.driver.set_window_position(200, 200)
        self.logger.info("Driver Setting Completed")

    @DriverCheck
    def Login(self, userId, password):
        """Log in to Naver with the given credentials.

        Returns True when the form was submitted without an exception and the
        'link_login_help' element is absent afterwards; returns False when
        that element is present (treated as a captcha) or any exception
        occurred.
        """
        try:
            # Go through the main page before the login page, with randomised
            # pauses between steps, to look less like an automated client.
            self.driver.get("http://www.naver.com/")
            self.driver.implicitly_wait(5)
            self.logger.info("Trying to Login : %s", LOGIN_URL)
            self.driver.get(LOGIN_URL)
            self.driver.implicitly_wait(5)
            time.sleep(randrange(2, 5)) # For bypass captcha
            self.driver.find_element_by_name('id').send_keys(userId)
            time.sleep(1)
            self.driver.find_element_by_name('pw').send_keys(password)
            time.sleep(randrange(2, 4))
            self.driver.find_element_by_xpath('//*[@id="frmNIDLogin"]/fieldset/input').click()
            try:
                self.driver.find_element_by_class_name('link_login_help')
            except NoSuchElementException:
                # Element absent: not bounced back to the login/captcha form.
                pass
            else:
                self.logger.error("Login Failed : Captcha Occured, Try Later...")
                return False
        except Exception as e:
            self.logger.error("Login Failed : Unknown Exception (%s)", str(e))
            return False
        else:
            self.logger.info("Login Success")
            return True

    @DriverCheck
    def GetMailSNList(self, keyword = None, unreadOnly = False):
        """Return the ``mailsn`` values (as ints) of the mails currently listed.

        keyword    -- when not None, typed into the search box and submitted.
        unreadOnly -- when True, the "unread" view filter is applied.

        Any exception is logged and an empty list is returned.
        """
        mailSNList = []
        try:
            self.logger.info("Start Getting MailSNList")
            self.driver.get('https://mail.naver.com')
            self.driver.implicitly_wait(5)
            if keyword is not None:
                self.driver.find_element_by_id('searchKeyWord').send_keys(keyword)
                self.driver.find_element_by_xpath('//*[@id="searchBtn"]').click()
                time.sleep(2)
            if unreadOnly:
                self.driver.find_element_by_xpath('//*[@id="listBtnMenu"]/div[@class="buttonSet"]/button[6]').click()
                time.sleep(0.5)
                self.driver.find_element_by_xpath('//*[@id="changeViewFilterLayer"]/div/ul[@class="selector list_filtering"]/li[@data-viewfilter="unread"]').click()
                time.sleep(2)
            htmlSource = self.driver.page_source
            bs = BeautifulSoup(htmlSource, 'html5lib')
            # Skip list items without a mailsn attribute so one stray <li>
            # does not discard every mail that was found.
            mailSNList = [
                int(item['mailsn'])
                for item in bs.select('ol.mailList > li')
                if item.has_attr('mailsn')
            ]
            self.logger.info("GetMailSNList Success : %d found", len(mailSNList))
        except Exception as e:
            self.logger.error("GetMailSNList Failed : Unknown Exception (%s)", str(e))
        return mailSNList

    @DriverCheck
    def DownloadAttatchedFiles(self, mailSN):
        """Open the mail's popup view and click every attachment link.

        Returns the titles of the attachments that were clicked. Titles
        collected before an exception are still returned; the exception is
        logged.
        """
        completedFileList = []
        try:
            self.driver.get('https://mail.naver.com/read/popup/?nMailId=%d' % (mailSN))
            self.driver.implicitly_wait(5)
            fileList = self.driver.find_elements_by_xpath('//*[@id="previewContent"]/div[@class="coverWrap"]/div[@class="attfile_area"]/div[@class="file_list"]/ul/li/span/a')
            for fileLink in fileList:
                fileName = fileLink.get_attribute('title')
                self.logger.info("Try Download : %s", fileName)
                fileLink.click()
                # Fixed pause after each click before moving to the next file.
                time.sleep(2)
                completedFileList.append(fileName)
        except Exception as e:
            self.logger.error("DownloadAttatchedFiles Error : Unknown Exception (%s)", str(e))
        return completedFileList
def main():
    """Log in, list the unread mails matching SEARCH_KEYWORD and download
    their attachments, then wait for Enter before returning."""
    crawler = NaverMailCrawler()
    if crawler.Login(USER_ID, PASSWORD):
        mailSNList = crawler.GetMailSNList(keyword = SEARCH_KEYWORD, unreadOnly = True)
        for mailSN in mailSNList:
            crawler.DownloadAttatchedFiles(mailSN)
    # raw_input exists only on Python 2; fall back to input on Python 3.
    try:
        waitForEnter = raw_input
    except NameError:
        waitForEnter = input
    waitForEnter("[*] Press Enter to exit")


if __name__ == '__main__':
    main()



Utils.py

# The MIT License
#
# Copyright (c) 2018 Sanghyeon Jeon
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import os
import logging
import logging.handlers
def CreateLogger(loggerName):
    """Return the logger named *loggerName*, configuring it on first use.

    On the first call for a given name the logger gets two handlers: a
    console handler at INFO level and a rotating file handler at DEBUG level
    writing to ``logs/<loggerName>.log`` under the current working directory.
    Later calls return the already configured logger unchanged.
    """
    logger = logging.getLogger(loggerName)
    if logger.handlers:
        # logger already exists
        return logger
    logPath = os.path.join(os.path.realpath(""), "logs", loggerName + ".log")
    Mkdirs(logPath)
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('[%(filename)s:%(lineno)s] %(asctime)s > %(levelname)s | %(message)s')
    # Create Handlers
    streamHandler = logging.StreamHandler()
    streamHandler.setLevel(logging.INFO)
    streamHandler.setFormatter(formatter)
    # With backupCount left at 0, RotatingFileHandler never rolls over and
    # maxBytes is ignored; keeping one backup makes the size limit effective.
    rotatingHandler = logging.handlers.RotatingFileHandler(
        logPath, maxBytes=1024 * 1024 * 1024, backupCount=1)
    rotatingHandler.setLevel(logging.DEBUG)
    rotatingHandler.setFormatter(formatter)
    # Add handlers to logger
    logger.addHandler(streamHandler)
    logger.addHandler(rotatingHandler)
    return logger
def Mkdirs(filePath):
    """Create the directory that contains *filePath*, including parents.

    Only the parent directory is created, never *filePath* itself. A path
    with no directory component is a no-op, and an already existing
    directory is not an error.
    """
    dirPath = os.path.dirname(filePath)
    if not dirPath:
        # Bare file name: nothing to create, and os.makedirs("") would raise.
        return
    try:
        os.makedirs(dirPath)
    except OSError:
        # Ignore "already exists" (also when created concurrently);
        # re-raise real failures.
        if not os.path.isdir(dirPath):
            raise
def RemoveFile(target, retryCount = 0):
    """Delete the file *target*, retrying on failure.

    target     -- path of the file to remove.
    retryCount -- number of extra attempts after the first one fails.

    Returns True as soon as a removal succeeds, False when every attempt
    failed with an OSError. Retries happen immediately, with no delay.
    """
    for _ in range(retryCount + 1):
        try:
            os.remove(target)
        except OSError:
            # Only filesystem errors are retried; KeyboardInterrupt and
            # SystemExit are allowed to propagate.
            continue
        else:
            return True
    return False
view raw Utils.py hosted with ❤ by GitHub

블로그 이미지

__미니__

E-mail : skyclad0x7b7@gmail.com 나와 계약해서 슈퍼 하-카가 되어 주지 않을래?

,