Skip to content

Commit 8655a72

Browse files
committed
initial commit
0 parents  commit 8655a72

16 files changed

+629
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
mboxes/*
2+
*.pyc

jobs/__init__.py

Whitespace-only changes.

jobs/management/__init__.py

Whitespace-only changes.

jobs/management/commands/__init__.py

Whitespace-only changes.
+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import re
3+
import urllib
4+
import mailbox
5+
6+
from django.conf import settings
7+
from django.core.management.base import BaseCommand
8+
9+
from jobs.models import JobEmail
10+
11+
# Directory under the project root that the mbox archives are downloaded to
# and read from.
mbox_dir = os.path.join(settings.PROJECT_DIR, "mboxes")
12+
13+
class Command(BaseCommand):
    """Management command that loads job emails from the local mbox archives."""

    def handle(self, *args, **options):
        """Feed every message of every mbox file to ``JobEmail.new_from_msg``.

        Prints the content type of each message, followed by a "loaded ..."
        line whenever ``new_from_msg`` returned a record for it.
        """
        for mbox_path in mboxes():
            archive = mailbox.mbox(mbox_path)
            for message in archive:
                print(message['content-type'])
                job_email = JobEmail.new_from_msg(message)
                if not job_email:
                    continue
                print("loaded %s" % job_email)
22+
23+
def mboxes():
    """Yield the full path of each ``.mbox`` file in ``mbox_dir``, sorted by name.

    If ``mbox_dir`` does not exist yet it is created and filled by calling
    ``download_mboxes()`` before anything is yielded.
    """
    if not os.path.isdir(mbox_dir):
        os.mkdir(mbox_dir)
        download_mboxes()
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # processing order stable from run to run.
    for filename in sorted(os.listdir(mbox_dir)):
        # Match the ".mbox" extension, not just any name ending in "mbox".
        if filename.endswith(".mbox"):
            yield os.path.join(mbox_dir, filename)
30+
31+
def download_mboxes(start_year=2004, end_year=2012):
    """Download the yearly code4lib mbox archives into ``mbox_dir``.

    start_year -- first year to fetch (inclusive)
    end_year   -- year at which to stop (exclusive)

    The defaults fetch 2004 through 2011. Each archive is saved as
    ``code4lib-<year>.mbox``. If a download raises, the file it was writing
    is removed and the exception is re-raised.
    """
    print("downloading code4lib mboxes")
    opener = urllib.URLopener()
    url = "http://serials.infomotions.com/code4lib/etc/mboxes/code4lib-%s.mbox"
    for year in range(start_year, end_year):
        mbox_url = url % year
        mbox_file = os.path.join(mbox_dir, "code4lib-%s.mbox" % year)
        print("saving %s as %s" % (mbox_url, mbox_file))
        try:
            opener.retrieve(mbox_url, mbox_file)
        except Exception:
            # A failed transfer can leave a truncated file on disk; remove it
            # so it is not later parsed as if it were a complete archive.
            if os.path.exists(mbox_file):
                os.remove(mbox_file)
            raise
40+

jobs/management/commands/nnp.py

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from django.core.management.base import BaseCommand
2+
3+
from jobs.models import JobEmail
4+
5+
class Command(BaseCommand):
    """Management command that prints the proper nouns of all stored job emails."""

    def handle(self, *args, **options):
        """Print each proper-noun phrase, lower-cased and UTF-8 encoded, one per line."""
        for job_email in JobEmail.objects.all():
            for phrase in job_email.proper_nouns():
                lowered = phrase.lower()
                print(lowered.encode('utf-8'))

jobs/management/commands/pop.py

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import os
2+
import email
3+
import poplib
4+
import logging
5+
6+
from django.conf import settings
7+
from django.core.management.base import BaseCommand
8+
9+
from jobs.models import JobEmail
10+
11+
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
12+
13+
class Command(BaseCommand):
    """Management command that checks a POP3 mailbox for new job emails."""

    def handle(self, *args, **options):
        """Retrieve every message in the mailbox and hand it to ``JobEmail.new_from_msg``.

        Connection details come from the ``POP_SERVER``, ``POP_PORT``,
        ``POP_USER`` and ``POP_PASSWORD`` settings. Each message for which
        ``new_from_msg`` returns a record is logged. The POP3 session is
        always ended with QUIT, even if processing a message raises.
        """
        log.info("checking for new emails")
        gmail = poplib.POP3_SSL(settings.POP_SERVER, settings.POP_PORT)
        try:
            gmail.user(settings.POP_USER)
            gmail.pass_(settings.POP_PASSWORD)

            num_messages = len(gmail.list()[1])
            # POP3 message numbers start at 1.
            for msg_num in range(1, num_messages + 1):
                # retr() returns (response, lines, octets); rebuild the raw
                # message text from its lines.
                email_txt = '\n'.join(gmail.retr(msg_num)[1])
                msg = email.message_from_string(email_txt)
                e = JobEmail.new_from_msg(msg)
                if e:
                    log.info("found a new job email: %s", e)
        finally:
            # Sign off and close the socket instead of leaking the connection.
            gmail.quit()

jobs/models.py

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
import re
2+
import time
3+
import codecs
4+
import rfc822
5+
import datetime
6+
import StringIO
7+
8+
import nltk
9+
10+
from django.db import models
11+
12+
class JobEmail(models.Model):
    """A job posting email, stored with its sender, subject, body and send time."""

    # Sender display name, rewritten by normalize_name().
    from_name = models.CharField(max_length=255)
    # Sender address, lower-cased.
    from_address = models.CharField(max_length=255)
    # The part of from_address after the "@".
    from_domain = models.CharField(max_length=255)
    subject = models.TextField()
    body = models.TextField()
    sent_time = models.DateTimeField()
    # Value of the Message-ID header; used to skip messages already stored.
    message_id = models.CharField(max_length=1024)

    def __str__(self):
        return "%s -%s" % (self.from_address, self.subject)

    def proper_nouns(self):
        """Yield each run of consecutive proper nouns in the body as one string.

        A token counts when it is tagged "NNP" and consists only of ASCII
        letters. Adjacent matching tokens are joined with single spaces.
        """
        nouns = []
        for word, pos in self.tags():
            is_proper_noun = pos == "NNP"
            is_word = re.match("^[a-z]+$", word, re.IGNORECASE)

            if is_proper_noun and is_word:
                nouns.append(word)
            elif nouns:
                yield " ".join(nouns)
                nouns = []
        # Flush a run that reaches the end of the body; without this the
        # final phrase would be silently dropped.
        if nouns:
            yield " ".join(nouns)

    def tags(self):
        """Return the body as a list of (token, part-of-speech tag) pairs from nltk."""
        words = nltk.word_tokenize(self.body)
        return nltk.pos_tag(words)

    @classmethod
    def new_from_msg(klass, msg):
        """Create, save and return a record for ``msg``, or return None.

        None is returned (and nothing is saved) when the message is not a job
        posting according to ``is_job``, when a record with the same
        Message-ID already exists, when the From header holds no address
        containing "@", when the body cannot be extracted, or when the Date
        header cannot be parsed.
        """
        if not is_job(msg):
            return None

        message_id = msg['message-id']
        # exists() rather than count() == 1, so the message is still treated
        # as a duplicate if more than one matching row is already present.
        if klass.objects.filter(message_id=message_id).exists():
            return None

        from_name, from_address = rfc822.parseaddr(msg['from'] or '')
        # Without an "@" there is no domain to record.
        if not from_address or '@' not in from_address:
            return None

        body = get_body(msg)
        if not body:
            return None

        # NOTE(review): parsedate() drops the UTC offset and mktime() treats
        # the tuple as local time, so the sender's zone is not applied.
        parsed_date = rfc822.parsedate(msg['date'])
        if parsed_date is None:
            return None

        e = klass()
        e.from_name = normalize_name(from_name or '')
        e.from_address = from_address.lower()
        e.from_domain = e.from_address.split('@')[1]
        e.subject = msg['subject']
        e.message_id = message_id
        e.body = body
        e.sent_time = datetime.datetime.fromtimestamp(time.mktime(parsed_date))

        e.save()
        return e
67+
68+
def normalize_name(name):
    """Rewrite a "Last, First" display name as "First Last".

    The name is split on commas, the final piece is moved to the front and
    the pieces are joined with single spaces. A name without a comma is
    returned unchanged. Empty pieces (from a leading, trailing or doubled
    comma) are dropped so they cannot introduce stray spaces.
    """
    if ',' not in name:
        return name
    parts = [p.strip() for p in name.split(',')]
    parts = [p for p in parts if p]
    if parts:
        # Rotate the last piece to the front.
        parts.insert(0, parts.pop())
    return ' '.join(parts)
76+
77+
def is_job(msg):
    """Return True if the message subject looks like a job posting.

    The subject is compared case-insensitively, ignoring surrounding
    whitespace. A missing or empty subject, or one starting with "re:",
    is not a job posting; otherwise the subject must contain "job" or
    "position".
    """
    subject = msg['subject']
    if not subject:
        return False
    # Strip first so a reply prefix preceded by whitespace is still
    # recognised as a reply.
    subject = subject.strip().lower()
    if subject.startswith('re:'):
        return False
    return 'job' in subject or 'position' in subject
88+
89+
def get_body(msg):
    """Return the decoded text body of ``msg``, or None if it cannot be decoded.

    None is returned when the message declares no charset, when the charset
    names an unknown codec, or when the message has no single payload (for
    example a multipart message). Any Content-Transfer-Encoding (base64,
    quoted-printable) is undone before the bytes are decoded with the
    declared charset. Bytes that are invalid for that charset raise
    UnicodeDecodeError.
    """
    charset = msg.get_content_charset()
    if not charset:
        return None

    # decode=True undoes the transfer encoding and yields None for multipart
    # messages, whose payload is a list of sub-messages rather than text.
    payload = msg.get_payload(decode=True)
    if payload is None:
        return None

    try:
        # Decode in one step so the original line endings are kept as-is.
        return payload.decode(charset)
    except LookupError:
        # The declared charset is not a codec Python knows about.
        return None

jobs/tests.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import email
2+
import unittest
3+
4+
from jobs.models import JobEmail
5+
6+
class JobsTests(unittest.TestCase):
    """Tests for building a JobEmail from the sample message in test-data/."""

    FIXTURE_PATH = "test-data/job-email"
    FIXTURE_MESSAGE_ID = '<7933CD19EEFCC94392323A994F6F1EDF01DBB52AE8@MBX03.cgcent.miami.edu>'

    def setUp(self):
        # new_from_msg() returns None for a Message-ID that is already
        # stored, and both tests load the same fixture; remove any copy left
        # behind by an earlier test so each test starts clean.
        JobEmail.objects.filter(message_id=self.FIXTURE_MESSAGE_ID).delete()

    def _load_fixture(self):
        """Parse the fixture file and return the JobEmail built from it."""
        # Close the file deterministically instead of leaking the handle.
        with open(self.FIXTURE_PATH) as fixture:
            msg = email.message_from_file(fixture)
        return JobEmail.new_from_msg(msg)

    def test_email(self):
        e = self._load_fixture()
        # NOTE(review): this address literal looks redacted in the copy of
        # the file under review; confirm it matches the fixture.
        self.assertEqual(e.from_address, "[email protected]")
        self.assertEqual(e.from_domain, 'miami.edu')
        self.assertEqual(e.from_name, 'Cheryl A. Gowing')
        self.assertEqual(e.subject, '[CODE4LIB] Job Posting: Head of Web & Emerging Technologies, University of Miami - revised')
        self.assertTrue('collaborates' in e.body)
        # assertEqual, not assertTrue: assertTrue would take the expected
        # value as the failure message and pass for any non-empty id.
        self.assertEqual(e.message_id, self.FIXTURE_MESSAGE_ID)

    def test_tagging(self):
        e = self._load_fixture()
        print(e.tags())

jobs/views.py

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Create your views here.

logs/.keep

Whitespace-only changes.

manage.py

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/usr/bin/env python
# Django management entry point: checks that a settings module can be found,
# then hands control to Django's command runner.
from django.core.management import execute_manager
import imp
import sys

# Printed when settings.py cannot be located; %r receives this file's path.
MISSING_SETTINGS_MESSAGE = (
    "Error: Can't find the file 'settings.py' in the directory containing %r. "
    "It appears you've customized things.\n"
    "You'll have to run django-admin.py, passing it your settings module.\n"
)

try:
    # Assumed to be in the same directory.
    imp.find_module('settings')
except ImportError:
    sys.stderr.write(MISSING_SETTINGS_MESSAGE % __file__)
    sys.exit(1)

import settings

if __name__ == "__main__":
    execute_manager(settings)

requirments.pip

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
django
2+
PyYAML
3+
nltk

0 commit comments

Comments
 (0)