Comparing records of the last created file with the next file: Python scraping with BS4
I am scraping this platform with Selenium and BS4, and I am able to retrieve all the information I want. The issue is that the platform contains sensor readings, and a few sensors are not updated every day, so I want to compare the records of the last scraped file with the next file to be created.
Example:
I created an xyz.csv file today that contains 200 records. Tomorrow I will execute my code to collect information again, but before creating the new file I want to check for duplicates against the last created file. Since the script runs every day, comparing against the last created file seems sufficient to me, but I am open to suggestions; a sketch of this comparison appears after the code below.
I am using the following code for scraping the information:
# -*- coding: utf-8 -*-
import csv
import re
import uuid
from datetime import datetime as dt
from time import gmtime, strftime

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.firefox.options import Options


class crawlHydro():
    def __init__(self):
        global downloadDir, uFileName, filname
        downloadDir = ""
        uFileName = str(uuid.uuid4())
        filname = downloadDir + uFileName + ".csv"

        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)

        # Set Firefox preferences for headless crawling
        fp = webdriver.FirefoxProfile()
        fp.set_preference("browser.download.folderList", 2)
        fp.set_preference("browser.download.manager.showWhenStarting", False)
        fp.set_preference("browser.download.dir", downloadDir)
        fp.set_preference("browser.helperApps.neverAsk.saveToDisk",
                          "attachment/csv")
        options = Options()
        options.add_argument("--headless")

        # Initialize the webdriver and load the target URL
        self.driver = webdriver.Firefox(firefox_profile=fp, firefox_options=options)
        self.driver.implicitly_wait(15)
        self.driver.get("http://www.epa.ie/hydronet/#Water%20Levels")
        self.verificationErrors = []
        self.accept_next_alert = True

    def crawl(self):
        driver = self.driver

        # Interact with the page to list all active EPA/LA stations
        driver.execute_script("window.scrollTo(0, 800)")
        driver.find_element_by_id("dijit_MenuItem_3_text").click()
        driver.find_element_by_xpath('//td[.="All"]').click()
        driver.find_element_by_xpath('//td[.="Active EPA/LA (239)"]').click()

        # Extract the rendered page content using BS4
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        headers = []
        for m in soup.find_all("th"):
            headers.append(m.get_text())
        headers.append('LastReadTime')

        # Extract the table content, i.e. columns and rows
        new_data = [[c.text.rstrip(" km²") for c in i.find_all('td')]
                    for i in soup.find_all('table')[5:]]
        # Convert timestamps to ISO 8601
        new_data = [[dt.strptime(i, '%d-%m-%Y %H:%M').strftime('%d-%m-%YT%H:%M+00')
                     if re.match(r"\d{2}-\d{2}-\d{4}\s\d{2}:\d{2}", i) else i
                     for i in m] for m in new_data]
        timerecorded = strftime("%Y-%m-%dT%H:%M+00", gmtime())

        # Drop the last four rows (page-footer tables) and stamp each
        # remaining row with the scrape time for the LastReadTime column
        finalDataList = []
        for row in range(len(new_data) - 4):
            finalDataList.append(new_data[row] + [timerecorded])
        print(finalDataList)

        with open(filname, 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(headers)
            writer.writerows(finalDataList)

        driver.quit()


if __name__ == '__main__':
    obj = crawlHydro()
    obj.crawl()
Can anyone point me in the right direction?
python web-scraping beautifulsoup duplicates
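One way to approach the comparison, given that the file names are random UUIDs, is to pick the previous file by modification time and drop any rows already present in it. Below is a minimal sketch, assuming the daily CSVs share one directory and the header produced by crawl(); DOWNLOAD_DIR, find_latest_csv and drop_already_seen are illustrative names, not part of the original script. The scrape-time column (LastReadTime) is excluded from the comparison, since it changes on every run even when the sensor reading does not.

# A minimal sketch of the daily comparison step, assuming all CSVs are
# written to one directory and share one header. DOWNLOAD_DIR,
# find_latest_csv and drop_already_seen are illustrative names.
import glob
import os

import pandas as pd

DOWNLOAD_DIR = "."  # the directory the crawler writes its CSVs to


def find_latest_csv(directory):
    """Return the most recently modified CSV in directory, or None."""
    files = glob.glob(os.path.join(directory, "*.csv"))
    return max(files, key=os.path.getmtime) if files else None


def drop_already_seen(new_rows, directory):
    """Drop rows from new_rows that already appear in the latest CSV."""
    latest = find_latest_csv(directory)
    if latest is None:
        return new_rows  # first run: nothing to compare against
    old_rows = pd.read_csv(latest, dtype=str)
    # Compare on the sensor columns only; LastReadTime is the scrape
    # timestamp and would differ on every run.
    keys = [c for c in new_rows.columns if c != "LastReadTime"]
    seen = old_rows[keys].apply(tuple, axis=1)
    return new_rows[~new_rows[keys].apply(tuple, axis=1).isin(seen)]

In crawl(), this would run just before the file is written, e.g. new_rows = drop_already_seen(pd.DataFrame(finalDataList, columns=headers), DOWNLOAD_DIR), followed by new_rows.to_csv(filname, index=False), so only previously unseen readings land in the new file.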
Why not use a database? It is easy to find duplicate data and export it as CSV. – kcorlidy, Nov 12 '18 at 12:49
Thank you for your suggestion, but the idea is to preprocess the data before ingesting it into a database. – Northern Shadow, Nov 12 '18 at 14:35
asked Nov 12 '18 at 8:17 by Northern Shadow, edited Nov 12 '18 at 9:46