#!/usr/bin/env python
"""
This file opens a docx (Office 2007) file and dumps the text.

If you need to extract text from documents, use this file as a basis for your
work.

Part of Python's docx module - http://github.com/mikemaccana/python-docx
See LICENSE for licensing information.
"""

import sys

from docx import opendocx, getdocumenttext

# Shown when the script is started without both file arguments.
USAGE = (
    "Please supply an input and output file. For example:\n"
    "  example-extracttext.py 'My Office 2007 document.docx' 'outp"
    "utfile.txt'"
)


def main(argv):
    """Extract the text of a docx file and write it to a UTF-8 text file.

    ``argv`` is the full argument vector: ``argv[1]`` is the path of the docx
    file to read and ``argv[2]`` is the path of the text file to write. Any
    further arguments are ignored.

    Returns the process exit status: 0 on success, 1 when the two file
    arguments are missing. Errors raised while opening the document or the
    output file are deliberately not caught, so the real cause is reported
    instead of being mistaken for a usage error.
    """
    if len(argv) < 3:
        print(USAGE)
        return 1

    # Read the document first so the output file is not created or truncated
    # when the input cannot be opened.
    document = opendocx(argv[1])

    # Fetch all the text out of the document we just opened
    paratextlist = getdocumenttext(document)

    # Make explicit UTF-8 encoded version of every paragraph
    newparatextlist = [paratext.encode("utf-8") for paratext in paratextlist]

    # Write the text of the document with two newlines under each paragraph.
    # The paragraphs are bytes at this point, so the file is opened in binary
    # mode and joined with a bytes separator; the context manager closes the
    # file even if the write fails.
    with open(argv[2], 'wb') as newfile:
        newfile.write(b'\n\n'.join(newparatextlist))

    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))