diff --git a/.gitignore b/.gitignore
index fc2cb1e..eb45148 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,8 @@
 # project
 settings.py
 output.log
+.project
+.pydevproject
 
 # python specific
 *.pyc
diff --git a/.project b/.project
new file mode 100644
index 0000000..6d730ed
--- /dev/null
+++ b/.project
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>dumpmon</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.python.pydev.PyDevBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.python.pydev.pythonNature</nature>
+	</natures>
+</projectDescription>
diff --git a/.pydevproject b/.pydevproject
new file mode 100644
index 0000000..40e9f40
--- /dev/null
+++ b/.pydevproject
@@ -0,0 +1,5 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<?eclipse-pydev version="1.0"?><pydev_project>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
+<pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python 2.7</pydev_property>
+</pydev_project>
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/dumpmon.py b/dumpmon.py
index a315673..b517f2c 100644
--- a/dumpmon.py
+++ b/dumpmon.py
@@ -12,12 +12,17 @@
 from lib.Pastebin import Pastebin, PastebinPaste
 from lib.Slexy import Slexy, SlexyPaste
 from lib.Pastie import Pastie, PastiePaste
-from lib.helper import log
+from lib.HaveIBeen import HaveIBeen, HaveIBeenPaste
+
+from lib.helper import log, createThread
+from lib.TwitterBot import TwitterBot
+from lib.RegexMgr import RegexMgr
+from lib.Stats import Stats
 from time import sleep
-from twitter import Twitter, OAuth
-from settings import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET, log_file
+from settings import log_file
 import threading
 import logging
+from logging.handlers import RotatingFileHandler
 
 
 def monitor():
@@ -30,36 +35,55 @@ def monitor():
     parser.add_argument(
         "-v", "--verbose", help="more verbose", action="store_true")
     args = parser.parse_args()
+
     level = logging.INFO
     if args.verbose:
         level = logging.DEBUG
+
     logging.basicConfig(
-        format='%(asctime)s [%(levelname)s] %(message)s', filename=log_file, level=level)
+        format='%(asctime)s [%(levelname)s][%(module)s][%(funcName)s] %(message)s', filename=log_file, level=level)
+
+    handler = RotatingFileHandler(log_file, maxBytes=20*1000,
+                                  backupCount=5)
+    #logging.addHandler(handler)
+
     logging.info('Monitoring...')
-    bot = Twitter(
-        auth=OAuth(ACCESS_TOKEN, ACCESS_TOKEN_SECRET,
-                   CONSUMER_KEY, CONSUMER_SECRET)
-    )
-    # Create lock for both output log and tweet action
+
+    regexMgr = RegexMgr()
+    bot = TwitterBot(regexMgr)
+
+    # Create lock for output log
     log_lock = threading.Lock()
-    tweet_lock = threading.Lock()
-
-    pastebin_thread = threading.Thread(
-        target=Pastebin().monitor, args=[bot, tweet_lock])
-    slexy_thread = threading.Thread(
-        target=Slexy().monitor, args=[bot, tweet_lock])
-    pastie_thead = threading.Thread(
-        target=Pastie().monitor, args=[bot, tweet_lock])
-
-    for thread in (pastebin_thread, slexy_thread, pastie_thead):
-        thread.daemon = True
-        thread.start()
+
+    #create an event to tell threads to keep running
+    isRunning = threading.Event()
+    isRunning.set()
+
+    #array to keep a handle on threads
+    workers = []
+
+    #these next 2 workers don't need to be joined when termd
+    createThread(bot.monitor)
+    createThread(Stats().monitor,bot)
+
+    #these workers need to be shut down gracefully
+    workers.append(createThread(HaveIBeen().monitor,bot,isRunning))
+    workers.append(createThread(Pastebin().monitor,bot,isRunning))
+    workers.append(createThread(Slexy().monitor,bot,isRunning))
+    workers.append(createThread(Pastie().monitor,bot,isRunning))
 
     # Let threads run
     try:
         while(1):
             sleep(5)
     except KeyboardInterrupt:
+        #signal threads to shutdown
+        isRunning.clear()
+        print 'stopping'
+        #wait for threads to join
+        for t in workers:
+            t.join()
+        print 'stopped'
         logging.warn('Stopped.')
 
 
diff --git a/lib/HaveIBeen.py b/lib/HaveIBeen.py
new file mode 100644
index 0000000..94024e1
--- /dev/null
+++ b/lib/HaveIBeen.py
@@ -0,0 +1,60 @@
+"""
+Troy Hunt's RSS Feed for the last 50 pastes
+
+http://feeds.feedburner.com/HaveIBeenPwnedLatestPastes
+
+"""
+import feedparser
+
+from .Site import Site
+from .Paste import Paste
+from bs4 import BeautifulSoup
+from . import helper
+from time import sleep
+from settings import SLEEP_HAVEIBEEN
+from twitter import TwitterError
+import logging
+
+class HaveIBeenPaste(Paste):
+    def __init__(self, id):
+        super(HaveIBeenPaste, self).__init__(id)
+        self.headers = None
+        self.url = 'http://pastebin.com/raw.php?i=' + self.id
+
+    def get(self):
+        self.text = helper.curl(self.url)
+
+class HaveIBeen(Site):
+    def __init__(self):
+        super(HaveIBeen, self).__init__()
+        self.sleep = SLEEP_HAVEIBEEN
+        logging.info('[+] Started HaveIBeen')
+        self.feedURL = 'http://feeds.feedburner.com/HaveIBeenPwnedLatestPastes'
+
+    def _parse(self):
+        try:
+            d = feedparser.parse(self.feedURL)
+            return d['entries']
+        except Exception as e:
+            logging.error('[!] Feed Parser Error: %s'%(str(e)))
+            return None
+
+    def update(self):
+        logging.debug('Retrieving HaveIBeenPwned ID\'s')
+        i=0
+
+        for entry in self._parse():
+            l = entry['links'][0]['href']
+            link = l.split(r'/')
+            paste = HaveIBeenPaste(link[3])
+            if not self.hasSeen(paste):
+                i+=1
+                self.put(paste)
+        logging.debug('HaveIBeenPwned Added URLs: ' + str(i))
+
+
+
+
+if __name__ == '__main__':
+    c = HaveIBeen()
+    c.update()
\ No newline at end of file
diff --git a/lib/Paste.py b/lib/Paste.py
index bc379c6..ebe1072 100644
--- a/lib/Paste.py
+++ b/lib/Paste.py
@@ -2,22 +2,47 @@ import settings
 import logging
 import re
+import time
 
 
 class Paste(object):
-    def __init__(self):
+    def __init__(self,id):
         '''
         class Paste: Generic "Paste" object to contain attributes of a standard paste
-
         '''
-        self.emails = 0
-        self.hashes = 0
+        self.id = id
+        self.emails = []
+        self.emails2 = []
+        self.hashes = []
         self.num_emails = 0
         self.num_hashes = 0
         self.text = None
         self.type = None
         self.sites = None
         self.db_keywords = 0.0
-
+
+    def __eq__(self,comparePaste):
+        #logging.info('id %s compares to %s'%(self.id, comparePaste.id))
+        return self.id == comparePaste.id
+
+    def row(self):
+        return {
+            'pid' : self.id,
+            'text' : self.text,
+            'emails' : self.emails,
+            'hashes' : self.hashes,
+            'num_emails' : self.num_emails,
+            'num_hashes' : self.num_hashes,
+            'type' : self.type,
+            'db_keywords' : self.db_keywords,
+            'url' : self.url,
+            "added":time.strftime("%c")
+        }
+
+    def get(self):
+        #override this
+        logging.error('[@] Function Not Implemented in Subclass')
+        pass
+
     def match(self):
         '''
         Matches the paste against a series of regular expressions to determine if the paste is 'interesting'
@@ -32,33 +57,64 @@ def match(self):
 
         '''
         # Get the amount of emails
-        self.emails = list(set(regexes['email'].findall(self.text)))
-        self.hashes = regexes['hash32'].findall(self.text)
-        self.num_emails = len(self.emails)
-        self.num_hashes = len(self.hashes)
-        if self.num_emails > 0:
-            self.sites = list(set([re.search('@(.*)$', email).group(1).lower() for email in self.emails]))
-        for regex in regexes['db_keywords']:
-            if regex.search(self.text):
-                logging.debug('\t[+] ' + regex.search(self.text).group(1))
-                self.db_keywords += round(1/float(
-                    len(regexes['db_keywords'])), 2)
-        for regex in regexes['blacklist']:
-            if regex.search(self.text):
-                logging.debug('\t[-] ' + regex.search(self.text).group(1))
-                self.db_keywords -= round(1.25 * (
-                    1/float(len(regexes['db_keywords']))), 2)
-        if (self.num_emails >= settings.EMAIL_THRESHOLD) or (self.num_hashes >= settings.HASH_THRESHOLD) or (self.db_keywords >= settings.DB_KEYWORDS_THRESHOLD):
-            self.type = 'db_dump'
-        if regexes['cisco_hash'].search(self.text) or regexes['cisco_pass'].search(self.text):
-            self.type = 'cisco'
-        if regexes['honeypot'].search(self.text):
-            self.type = 'honeypot'
-        if regexes['google_api'].search(self.text):
-            self.type = 'google_api'
-        # if regexes['juniper'].search(self.text): self.type = 'Juniper'
-        for regex in regexes['banlist']:
-            if regex.search(self.text):
-                self.type = None
-                break
-        return self.type
+        try:
+            r = self.text.splitlines()
+            logging.debug("[*] Num Lines in text: %i"%(len(r)))
+
+            if regexes['email'].search(self.text):
+                self.emails = regexes['email'].findall(self.text)
+
+            if regexes['email2'].search(self.text):
+                self.emails2 = regexes['email2'].findall(self.text)
+
+            self.hashes = regexes['hash32'].findall(self.text)
+
+            self.num_emails = len(self.emails)
+            logging.debug("[*] Num Emails: %i"%(self.num_emails))
+
+            self.num_emails = len(self.emails2)
+            logging.debug("[*] Num Emails2: %i"%(self.num_emails))
+
+            self.num_hashes = len(self.hashes)
+            logging.debug("[*] Num Hashes: %i"%(self.num_hashes))
+
+            if self.num_emails > 0:
+                self.sites = list(set([re.search('@(.*)$', email).group(1).lower() for email in self.emails]))
+                logging.debug("[*] Num Sites: %i"%(len(self.sites)))
+
+            for regex in regexes['db_keywords']:
+                if regex.search(self.text):
+                    logging.debug('\t[+] ' + regex.search(self.text).group(1))
+                    self.db_keywords += round(1/float(
+                        len(regexes['db_keywords'])), 2)
+
+            for regex in regexes['blacklist']:
+                if regex.search(self.text):
+                    logging.debug('\t[-] ' + regex.search(self.text).group(1))
+                    self.db_keywords -= round(1.25 * (
+                        1/float(len(regexes['db_keywords']))), 2)
+
+            if (self.num_emails >= settings.EMAIL_THRESHOLD) or (self.num_hashes >= settings.HASH_THRESHOLD) or (self.db_keywords >= settings.DB_KEYWORDS_THRESHOLD):
+                self.type = 'db_dump'
+
+            if regexes['cisco_hash'].search(self.text) or regexes['cisco_pass'].search(self.text):
+                self.type = 'cisco'
+
+            if regexes['honeypot'].search(self.text):
+                self.type = 'honeypot'
+
+            if regexes['google_api'].search(self.text):
+                self.type = 'google_api'
+
+            # if regexes['juniper'].search(self.text): self.type = 'Juniper'
+            for regex in regexes['banlist']:
+                if regex.search(self.text):
+                    self.type = None
+                    break
+
+            logging.debug("[*] Type: %s"%(self.type))
+            return self.type
+
+        except Exception as e:
+            logging.error("[!] Error: %s"%(str(e)))
+            return None
diff --git a/lib/Pastebin.py b/lib/Pastebin.py
index e9656cd..94303a1 100644
--- a/lib/Pastebin.py
+++ b/lib/Pastebin.py
@@ -10,37 +10,40 @@
 class PastebinPaste(Paste):
     def __init__(self, id):
-        self.id = id
+        super(PastebinPaste, self).__init__(id)
         self.headers = None
         self.url = 'http://pastebin.com/raw.php?i=' + self.id
-        super(PastebinPaste, self).__init__()
-
+
+    def get(self):
+        self.text = helper.curl(self.url)
 
 class Pastebin(Site):
-    def __init__(self, last_id=None):
-        if not last_id:
-            last_id = None
-        self.ref_id = last_id
+    """
+    Pastebin will block your IP if you request more than 600 requests in 10 mins. This is per admin@pastebin.com
+    """
+    def __init__(self):
         self.BASE_URL = 'http://pastebin.com'
         self.sleep = SLEEP_PASTEBIN
         super(Pastebin, self).__init__()
-
+        logging.info('[+] Started PasteBin')
+
+    def terminating(self):
+        #TODO: persist the seen queue
+        pass
+
+    def parse(self):
+        return BeautifulSoup(helper.curl(self.BASE_URL + '/archive')).find_all(
+            lambda tag: tag.name == 'td' and tag.a and '/archive/' not in tag.a['href'] and tag.a['href'][1:])
+
     def update(self):
         '''update(self) - Fill Queue with new Pastebin IDs'''
-        logging.info('Retrieving Pastebin ID\'s')
-        results = BeautifulSoup(helper.download(self.BASE_URL + '/archive')).find_all(
-            lambda tag: tag.name == 'td' and tag.a and '/archive/' not in tag.a['href'] and tag.a['href'][1:])
-        new_pastes = []
-        if not self.ref_id:
-            results = results[:60]
-        for entry in results:
+        logging.debug('Retrieving Pastebin ID\'s')
+        i=0
+        for entry in self.parse():
             paste = PastebinPaste(entry.a['href'][1:])
-            # Check to see if we found our last checked URL
-            if paste.id == self.ref_id:
-                break
-            new_pastes.append(paste)
-        for entry in new_pastes[::-1]:
-            logging.info('Adding URL: ' + entry.url)
-            self.put(entry)
-    def get_paste_text(self, paste):
-        return helper.download(paste.url)
+            if not self.hasSeen(paste):
+                #logging.info('Adding URL: ' + paste.url)
+                i+=1
+                self.put(paste)
+        logging.debug('Pastebin Added URLs: ' + str(i))
+
diff --git a/lib/Pastie.py b/lib/Pastie.py
index da84e55..6a38b12 100644
--- a/lib/Pastie.py
+++ b/lib/Pastie.py
@@ -10,39 +10,37 @@
 class PastiePaste(Paste):
     def __init__(self, id):
-        self.id = id
+        super(PastiePaste, self).__init__(id)
         self.headers = None
         self.url = 'http://pastie.org/pastes/' + self.id + '/text'
-        super(PastiePaste, self).__init__()
+    def get(self):
+        try:
+            self.text = BeautifulSoup(helper.curl(self.url)).pre.text
+        except Exception as e:
+            logging.error('[!] Beautiful Soup Error: %s'%(str(e)))
+            self.text = None
 
 
 class Pastie(Site):
-    def __init__(self, last_id=None):
-        if not last_id:
-            last_id = None
-        self.ref_id = last_id
+    def __init__(self):
         self.BASE_URL = 'http://pastie.org'
         self.sleep = SLEEP_PASTIE
         super(Pastie, self).__init__()
+        logging.info('[+] Started Pastie')
+
+    def parse(self):
+        return [tag for tag in BeautifulSoup(helper.curl(
+            self.BASE_URL + '/pastes')).find_all('p', 'link') if tag.a]
 
     def update(self):
         '''update(self) - Fill Queue with new Pastie IDs'''
-        logging.info('Retrieving Pastie ID\'s')
-        results = [tag for tag in BeautifulSoup(helper.download(
-            self.BASE_URL + '/pastes')).find_all('p', 'link') if tag.a]
-        new_pastes = []
-        if not self.ref_id:
-            results = results[:60]
-        for entry in results:
+        logging.debug('Retrieving Pastie ID\'s')
+        i=0
+        for entry in self.parse():
             paste = PastiePaste(entry.a['href'].replace(
                 self.BASE_URL + '/pastes/', ''))
-            # Check to see if we found our last checked URL
-            if paste.id == self.ref_id:
-                break
-            new_pastes.append(paste)
-        for entry in new_pastes[::-1]:
-            logging.debug('Adding URL: ' + entry.url)
-            self.put(entry)
+            if not self.hasSeen(paste):
+                i+=1
+                self.put(paste)
+        logging.debug('Pastie Added URLs: ' + str(i))
 
-    def get_paste_text(self, paste):
-        return BeautifulSoup(helper.download(paste.url)).pre.text
\ No newline at end of file
diff --git a/lib/RegexMgr.py b/lib/RegexMgr.py
new file mode 100644
index 0000000..f9c8d46
--- /dev/null
+++ b/lib/RegexMgr.py
@@ -0,0 +1,61 @@
+import re
+from pymongo import MongoClient
+from settings import USE_DB, DB_HOST, DB_PORT
+import time
+import logging
+
+import threading
+
+class RegexMgr(object):
+    """
+    This class is intended to handle all the regex stuff and persistance to the DB for observers
+    """
+    def __init__(self):
+        self.regexLock = threading.Lock()
+
+        if USE_DB:
+            try:
+                self.client = MongoClient(DB_HOST, DB_PORT).paste_db.regexes
+            except pymongo.errors.ConnectionFailure, e:
+                logging.error('[!] Database failed to start %s'%(e))
+
+        self.customRegexes = []
+        self._loadRegexes()
+
+
+    def _loadRegexes(self):
+        with self.regexLock:
+            cursor = self.client.find()
+            for row in cursor:
+                customRegex = {}
+                rc = re.compile(row['regex'])
+                customRegex['regex'] = rc
+                customRegex['user'] = row['user']
+                customRegex['added'] = time.strftime("%c")
+                self.customRegexes.append(customRegex)
+
+            logging.info("[+] Loaded custom regexes: %s"%(self.customRegexes))
+
+    def reloadCustomRegexes(self):
+        self.customRegexes = []
+        self._loadRegexes()
+
+    def add(self,regex, user):
+        if self.valid(regex):
+            o = {"user":user,"regex":regex,"added":time.strftime("%c")}
+            self.client.insert(o)
+            self.customRegexes.append(o)
+            return True
+
+    def valid(self,regex):
+        try:
+            re.compile(regex)
+            is_valid = True
+        except re.error:
+            is_valid = False
+        return is_valid
+
+
+
+
+
\ No newline at end of file
diff --git a/lib/Site.py b/lib/Site.py
index 6ab60ef..e94e92c 100644
--- a/lib/Site.py
+++ b/lib/Site.py
@@ -5,10 +5,12 @@
 from pymongo import MongoClient
 from requests import ConnectionError
 from twitter import TwitterError
-from settings import USE_DB, DB_HOST, DB_PORT
+from settings import USE_DB, DB_HOST, DB_PORT, SEEN_DEQUE_LEN
 import logging
 import helper
+from random import randint
+from collections import deque
 
 
 class Site(object):
     '''
@@ -31,19 +33,33 @@ class Site(object):
     # that I could find... So, I decided to implement my own queue with a few
     # changes
     def __init__(self, queue=None):
+
+        # the double ended queue is used to check the last n URLs to see if they have been processed, since the URLs are random strings.
+        self.seen = deque(maxlen=SEEN_DEQUE_LEN)
+
         if queue is None:
             self.queue = []
+
         if USE_DB:
             # Lazily create the db and collection if not present
             self.db_client = MongoClient(DB_HOST, DB_PORT).paste_db.pastes
-
+    def addSeen(self,item):
+        self.seen.append(item)
+        #logging.info('[@] Site deque len %i'%(len(self.seen)))
+
+    def hasSeen(self,item):
+        res = item in self.seen
+        #logging.info('[@] URL Seen %s %s'%(item.url,res))
+        return res
+
     def empty(self):
         return len(self.queue) == 0
 
     def get(self):
         if not self.empty():
             result = self.queue[0]
+            self.addSeen(result)
             del self.queue[0]
         else:
             result = None
@@ -67,36 +83,50 @@ def clear(self):
     def list(self):
         print('\n'.join(url for url in self.queue))
 
-    def monitor(self, bot, t_lock):
+    def parse(self):
+        #override this
+        logging.error('[@] Function Not Implemented in Subclass')
+        pass
+
+    def update(self):
+        #override this
+        logging.error('[@] Function Not Implemented in Subclass')
+        pass
+
+    def terminating(self):
+        #this can be overridden in subclass
+        logging.debug('[!] Terminating.....')
+
+    def monitor(self, bot, isRunning):
         self.update()
-        while(1):
+        while isRunning.is_set():
             while not self.empty():
+                if not isRunning.is_set(): break
+                #need to sleep to avoid the ban....
+                time.sleep(randint(2,5))
                 paste = self.get()
-                self.ref_id = paste.id
-                logging.info('[*] Checking ' + paste.url)
-                paste.text = self.get_paste_text(paste)
+                paste.get()
                 tweet = helper.build_tweet(paste)
                 if tweet:
                     logging.info(tweet)
-                    with t_lock:
+                    with bot.tweetLock:
                         if USE_DB:
-                            self.db_client.save({
-                                'pid' : paste.id,
-                                'text' : paste.text,
-                                'emails' : paste.emails,
-                                'hashes' : paste.hashes,
-                                'num_emails' : paste.num_emails,
-                                'num_hashes' : paste.num_hashes,
-                                'type' : paste.type,
-                                'db_keywords' : paste.db_keywords,
-                                'url' : paste.url
-                            })
+                            try:
+                                self.db_client.save(paste.row())
+                            except Exception as e:
+                                logging.error('[!] MongoDB Error %s'%(str(e)))
                         try:
+                            logging.debug('[+] Tweet %s'%(tweet))
                             bot.statuses.update(status=tweet)
-                        except TwitterError:
-                            pass
+                        except TwitterError as e:
+                            logging.error('[!] TwitterError %s'%(str(e)))
+            if not isRunning.is_set(): break
             self.update()
             while self.empty():
                 logging.debug('[*] No results... sleeping')
                 time.sleep(self.sleep)
                 self.update()
+
+        self.terminating()
+
+
diff --git a/lib/Slexy.py b/lib/Slexy.py
index 3876c81..b1c41d2 100644
--- a/lib/Slexy.py
+++ b/lib/Slexy.py
@@ -10,38 +10,34 @@
 class SlexyPaste(Paste):
     def __init__(self, id):
-        self.id = id
+        super(SlexyPaste, self).__init__(id)
         self.headers = {'Referer': 'http://slexy.org/view/' + self.id}
         self.url = 'http://slexy.org/raw/' + self.id
-        super(SlexyPaste, self).__init__()
-
+
+    def get(self):
+        self.text = helper.curl(self.url, self.headers['Referer'])
 
 class Slexy(Site):
-    def __init__(self, last_id=None):
-        if not last_id:
-            last_id = None
-        self.ref_id = last_id
+    def __init__(self):
         self.BASE_URL = 'http://slexy.org'
         self.sleep = SLEEP_SLEXY
         super(Slexy, self).__init__()
-
+        logging.info('[+] Started Slexy')
+
+    def parse(self):
+        return BeautifulSoup(helper.curl(self.BASE_URL + '/recent')).find_all(
+            lambda tag: tag.name == 'td' and tag.a and '/view/' in tag.a['href'])
+
     def update(self):
         '''update(self) - Fill Queue with new Slexy IDs'''
-        logging.info('[*] Retrieving Slexy ID\'s')
-        results = BeautifulSoup(helper.download(self.BASE_URL + '/recent')).find_all(
-            lambda tag: tag.name == 'td' and tag.a and '/view/' in tag.a['href'])
-        new_pastes = []
-        if not self.ref_id:
-            results = results[:60]
-        for entry in results:
+        logging.debug('[*] Retrieving Slexy ID\'s')
+
+        i=0
+        for entry in self.parse():
             paste = SlexyPaste(entry.a['href'].replace('/view/', ''))
-            # Check to see if we found our last checked URL
-            if paste.id == self.ref_id:
-                break
-            new_pastes.append(paste)
-        for entry in new_pastes[::-1]:
-            logging.info('[+] Adding URL: ' + entry.url)
-            self.put(entry)
+            if not self.hasSeen(paste):
+                i+=1
+                self.put(paste)
+        logging.debug('Slexy Added URLs: ' + str(i))
+
 
-    def get_paste_text(self, paste):
-        return helper.download(paste.url, paste.headers)
diff --git a/lib/Stats.py b/lib/Stats.py
new file mode 100644
index 0000000..0c13e7f
--- /dev/null
+++ b/lib/Stats.py
@@ -0,0 +1,83 @@
+from pymongo import MongoClient
+from bson import Code
+from twitter import TwitterError
+from settings import USE_DB, DB_HOST, DB_PORT, STATS_FREQ
+
+import logging
+import time
+
+class Stats(object):
+    def __init__(self):
+        if USE_DB:
+            try:
+                self.client = MongoClient(DB_HOST, DB_PORT).paste_db.pastes
+            except pymongo.errors.ConnectionFailure, e:
+                logging.error('[!] Database failed to start %s'%(e))
+        #commenting this cache call out; taking too much time at start up.
+        #self.cacheEmail = self.uniqueEmailSet()
+        self.cacheEmail = None
+
+    def uniqueEmailSet(self):
+        map = Code("function () {"
+                   "    this.emails.forEach(function(z) {"
+                   "        emit(z,1);"
+                   "    });"
+                   "}")
+        reduce = Code("function (key,values) {" "var total = 0;" "for (var i = 0; i
 0:
@@ -63,7 +105,6 @@ def build_tweet(paste):
         tweet += ' Possible SSH private key'
     elif paste.type == 'honeypot':
         tweet += ' Dionaea Honeypot Log'
-    tweet += ' #infoleak'
-    if paste.num_emails > 0:
-        print(paste.emails)
+    tweet += ' #infosec #dataleak'
+    return tweet
 
diff --git a/lib/regexes.py b/lib/regexes.py
index 4d1e535..2486d25 100644
--- a/lib/regexes.py
+++ b/lib/regexes.py
@@ -2,6 +2,7 @@
 regexes = {
     'email': re.compile(r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}', re.I),
+    'email2':re.compile(r'[\w\.-]+@[\w\.-]+'),
     #'ssn' : re.compile(r'\d{3}-?\d{2}-?\d{4}'),
     'hash32': re.compile(r'[^