데이터 공부를 기록하는 공간

[crawling, flask] 프로젝트2 - 뉴스 및 네이버쇼핑 크롤링 페이지 본문

STUDY/MLOPS

[crawling, flask] 프로젝트2 - 뉴스 및 네이버쇼핑 크롤링 페이지

BOTTLE6 2021. 12. 19. 22:06

 

 

<app.py>

- 크롬 웹드라이버 다운로드 : https://chromedriver.chromium.org/downloads

- 크롤링 시, 크롬 개발자도구의 Copy selector 기능 활용

from flask import Flask, render_template, request

# The Flask application object; the route decorators in this file register on it.
app = Flask(__name__)

import requests
from bs4 import BeautifulSoup
import time

# Prepare an openpyxl workbook for writing results to Excel.
# NOTE(review): the workbook is created once at import time, so this single
# object is shared module-wide for the lifetime of the process.
from openpyxl import Workbook
write_wb = Workbook()
write_ws = write_wb.active  # the workbook's active worksheet

from selenium import webdriver

@app.route('/')
def hello_world():
    """Serve the landing page rendered from the ``index.html`` template."""
    landing_page = render_template("index.html")
    return landing_page


@app.route('/result', methods=['POST'])
def result():
    """Scrape Daum news headlines for a keyword and export them to Excel.

    Reads ``input1`` (search keyword) and ``input2`` (number of result pages)
    from the submitted form, collects the headline text of every page, writes
    the headlines to column 1 of ``static/result.xlsx`` and renders
    ``result.html`` with the collected list.

    Returns:
        The rendered ``result.html`` page, or a ``400`` response when
        ``input2`` is not an integer.
    """
    keyword = request.form['input1']
    try:
        page_count = int(request.form['input2'])
    except ValueError:
        # An empty or non-numeric page count is a client error, not a crash.
        return "페이지수를 입력해주세요", 400

    daum_list = []
    for num in range(1, page_count + 1):
        # Passing the query via ``params`` lets requests URL-encode the
        # keyword (spaces, '&', '#', non-ASCII text) instead of splicing it
        # raw into the URL.
        req = requests.get(
            "https://search.daum.net/search",
            params={
                "w": "news",
                "nil_search": "btn",
                "DA": "NTB",
                "enc": "utf8",
                "cluster": "y",
                "cluster_page": "1",
                "q": keyword,
                "p": num,
            },
            timeout=10,  # without a timeout a stalled server blocks the request forever
        )
        soup = BeautifulSoup(req.text, "html.parser")
        daum_list.extend(
            link.text for link in soup.find_all("a", class_="tit_main fn_tit_u")
        )

    # Use a fresh workbook per request: a workbook shared across requests
    # keeps rows from an earlier, longer result set in the saved file.
    workbook = Workbook()
    sheet = workbook.active
    for row, title in enumerate(daum_list, start=1):
        sheet.cell(row, 1, title)
    workbook.save("static/result.xlsx")

    return render_template("result.html", daum_list=daum_list)


# CSS path of the element that contains the product cards on the results page.
_PRODUCT_LIST_SELECTOR = "#__next > div > div.style_container__1YjHN > div.style_inner__18zZX > div.style_content_wrap__1PzEo > div.style_content__2T20F > ul > div"

# CSS path of the seller-filter link that is clicked for the second pass.
# The original author noted that this selector seems to vary with the search term.
_SELLER_FILTER_SELECTOR = "#__next > div > div.style_container__1YjHN > div.style_inner__18zZX > div.style_content_wrap__1PzEo > div.style_content__2T20F > div.seller_filter_area > ul > li.active > a"


def _scroll_down(driver, steps=5, step_px=1000, pause=1):
    """Scroll the window down ``steps`` times, ``step_px`` further each time.

    Sleeps ``pause`` seconds after every scroll; presumably this gives
    content that loads on scroll time to appear in ``driver.page_source``.
    """
    for step in range(1, steps + 1):
        driver.execute_script("window.scrollTo(0, " + str(step * step_px) + ")")
        time.sleep(pause)


def _collect_products(page_source, titles, image_srcs):
    """Append product titles and image URLs found in ``page_source``.

    Does nothing when the product list container is not present, so a page
    without results does not raise ``IndexError``.
    """
    soup = BeautifulSoup(page_source, 'html.parser')
    containers = soup.select(_PRODUCT_LIST_SELECTOR)
    if not containers:
        return
    product_list = containers[0]
    titles.extend(
        div.text
        for div in product_list.find_all("div", class_='basicList_title__3P9Q7')
    )
    # ``src=True`` skips <img> tags that have no src attribute at all.
    image_srcs.extend(img['src'] for img in product_list.find_all("img", src=True))


@app.route('/naver_shopping', methods = ['POST'])
def naver_shopping():
    """Scrape Naver Shopping product titles and images with Selenium.

    Reads the search term from form field ``input3``, loads the search
    results in Chrome, scrolls the page and collects titles and image URLs.
    It then clicks the seller-filter link, scrolls again and collects a
    second batch into the same lists.

    Returns:
        The rendered ``shopping.html`` page with ``search_list`` (titles),
        ``search_list_src`` (image URLs) and ``len`` (number of image URLs).
    """
    from urllib.parse import quote
    from selenium.common.exceptions import NoSuchElementException

    search = request.form['input3']
    search_list = []
    search_list_src = []
    driver = webdriver.Chrome("./chromedriver")
    try:
        # Wait up to 3 seconds when locating elements.
        driver.implicitly_wait(3)

        # Encode the term so characters such as '&', '#' or spaces cannot
        # break or alter the query string.
        driver.get(
            "https://search.shopping.naver.com/search/all?query="
            + quote(search, safe="")
        )
        _scroll_down(driver)
        _collect_products(driver.page_source, search_list, search_list_src)

        try:
            # The locator is passed as a plain strategy string because the
            # find_element_by_* helpers are deprecated in newer Selenium.
            driver.find_element("css selector", _SELLER_FILTER_SELECTOR).click()
        except NoSuchElementException:
            # The filter link is not present for every search; keep the
            # results already collected instead of failing the request.
            pass
        else:
            time.sleep(1)
            _scroll_down(driver)
            _collect_products(driver.page_source, search_list, search_list_src)
    finally:
        # quit() ends the browser and the chromedriver process even when
        # scraping raised; close() would only close the current window.
        driver.quit()

    return render_template("shopping.html",
                           search_list=search_list,
                           search_list_src=search_list_src,
                           len=len(search_list_src))

# Start the Flask server only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run()

</templates/index.html>

- 부트스트랩 : http://bootstrapk.com/getting-started/#download

- 부트스트랩 - CSS - 폼 : http://bootstrapk.com/css/#forms

<!DOCTYPE html>
<html lang='en'>
<head>
    <meta charset="UTF-8">
    <title>Title</title>

    <!-- 1. Bootstrap: copied from the "Download" section of the Bootstrap docs -->
    <!-- Latest compiled and minified CSS -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap.min.css">

    <!-- Optional theme -->
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/css/bootstrap-theme.min.css">

    <!-- Latest compiled and minified JavaScript -->
    <!-- NOTE(review): Bootstrap 3's JavaScript expects jQuery to be loaded first, and
         this page does not load it. TODO: add jQuery or drop this script if no JS
         components are needed. -->
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.2/js/bootstrap.min.js"></script>

    <!-- 4. Link css/index.css. The attribute list must not contain a comma and the
         href value is quoted so the generated URL is always a single attribute value. -->
    <link rel="stylesheet" href="{{ url_for('static', filename='css/index.css') }}">

</head>
<body>
    <!-- 3. div id='wrap'으로 감싸기 -->
    <div id='wrap'>
    <!-- 2. 부트스트랩 - CSS - 폼 - 복사/붙여넣기 -->
        <!-- 4. 액션과 매소드 입력 -->
        
        <div>
            <div class="news">
                <!-- Posts the keyword (input1) and page count (input2) to /result. -->
                <form action='/result' method='POST'>

                    <div class="form-group">
                        <h1>키워드 입력</h1>
                        <!-- Each label's "for" matches its input's id so clicking the label focuses the field. -->
                        <label for="keywordInput">크롤링 키워드</label>
                        <input type="text" class="form-control" id="keywordInput" name='input1' placeholder="키워드를 입력해주세요" required>
                    </div>
                    <div class="form-group">
                        <h1>페이지 입력</h1>
                        <label for="pageCountInput">페이지 수</label>
                        <!-- required + min keep empty or non-positive page counts from being submitted. -->
                        <input type="number" class="form-control" id="pageCountInput" name='input2' placeholder="페이지수를 입력해주세요" min="1" required>
                    </div>

                    <button type="submit" class="btn btn-default">제출</button>

                </form>
            </div>
        </div>

        <div>
            <div class="naver">
                <!-- Posts the search term (input3) to /naver_shopping. -->
                <form action="/naver_shopping" method='POST'>
                    <h1> 네이버 쇼핑 크롤링 </h1>

                    <div class="form-group">
                        <!-- required: an empty query would send the scraper to a page with no results. -->
                        <input type="text" class="form-control" name='input3' required>
                    </div>

                    <button type="submit" class="btn btn-default"> 네이버 쇼핑 크롤링 셀레니움 </button>

                </form>
            </div>
        </div>
    </div>
</body>
</html>

 


</templates/result.html>

<!DOCTYPE html>
<html lang='en'>
<head>
    <meta charset="UTF-8">
    <title>Title</title>

</head>
<body>
    <!-- One list item per scraped headline passed in as daum_list. -->
    <ul>
    {% for headline in daum_list %}
        <li>{{ headline }}</li>
    {% endfor %}
    </ul>

    <!-- url_for builds the static URL from the app configuration, so the link
         keeps working if the page is served from a different path. -->
    <a href="{{ url_for('static', filename='result.xlsx') }}">다운로드 버튼</a>
</body>
</html>

 


</templates/shopping.html>

<!DOCTYPE html>
<html lang='en'>
<head>
    <meta charset="UTF-8">
    <title>Title</title>

</head>
<body>

    <!-- Renders a title item followed by an image item for each index below len. -->
    <ul>
    {% for i in range(0, len) %}
        <li>{{ search_list[i] }}</li>
        <!-- The src value is quoted so URLs containing spaces or '>' stay one attribute. -->
        <li><img src="{{ search_list_src[i] }}" alt="{{ search_list[i] }}"></li>
    {% endfor %}
    </ul>
</body>
</html>

 


</static/css/index.css>

/* Fixed-width wrapper with automatic margins. */
#wrap {
    margin: auto;
    width: 600px;
}

/* The news form and the Naver Shopping form use the same top spacing. */
.news,
.naver {
    margin-top: 600px;
}

새롭게 알게 된 점 :

- selenium은 동적으로 크롤링이 가능하게 해 줌 : 버튼클릭, 스크롤링 등 

- openpyxl은 파이썬에서 엑셀(.xlsx) 파일을 만들고 저장할 수 있게 해 줌 : Workbook 생성, 셀 쓰기, save 등

 

 


결과

 

<127.0.0.1:5000>

<"라이언"으로 뉴스 검색>

 

<"라이언"으로 네이버쇼핑 크롤링>

Comments