Text Mining, Parsing and Formatting with Python (sample)

# -*- coding: utf-8 -*-

##################################################################################################################################
##       MULTIPURPOSE-KNOWLEDGE-CORPUS-PARSERv.1.0                ##
## Functions:                             ##
## 1. Runs through a directory containing the knowledge corpus, which is a collection of pdf files on a specific topic.  ##
## 2. Calls PDFMiner to handle these pdf files and extract their contents.              ##
## 3. Concatenates the extracted text, from the pdf files, into a single text file.           ##
## 4. Re-writes the extraction output to a new text file, in order to clean it of malformed or misrecognised characters.    ##
## 5. In Linux as an optional function the script may use the shell and call treetagger to process the text file.    ##
##                                ##
##################################################################################################################################



from pdfminer.pdfinterp import PDFResourceManager, process_pdf
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
import os
import codecs
import re
import shlex
import subprocess
from subprocess import call
import sys


# Directory that holds the PDF files of the knowledge corpus.
dirpath = '/path/to/directory/'
# Names of all entries found in that directory (listed once, when the script starts).
filenames = os.listdir(dirpath)

# Disabled: pattern matcher for NUL ("\0") characters
# emptychar = re.compile("\0")

# Running count of the PDF files processed so far.
nb = 0

# define convert pdf function
def convert_pdf(path):
    """Extract the text of one PDF file with PDFMiner and prepend a title tag.

    Args:
        path: path of the PDF file to read.

    Returns:
        A string made of the tag ``<FILE = name >`` (where ``name`` is the
        file name without its directory and extension), a blank line, and
        the text PDFMiner extracted. If the extraction fails part-way, an
        error line is printed and whatever text was collected so far is
        still returned after the tag.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    codec = 'ascii'
    # Layout analysis parameters:
    #   all_texts: forces layout analysis for all the text strings,
    #              including text contained in figures.
    #   detect_vertical: allows vertical writing detection.
    laparams = LAParams(all_texts=False, detect_vertical=False)
    device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)

    try:
        # A PDF is binary data, so a plain binary open() is all that is needed.
        with open(path, 'rb') as fp:
            # error handling in case of locked pdf
            try:
                process_pdf(rsrcmgr, device, fp)
            except Exception:
                # Best effort: report the file and keep going with what we have.
                # Exception (not a bare except) lets Ctrl-C / SystemExit through.
                print("error : extraction not allowed for " + path)
    finally:
        # Release the converter even if opening or reading the file failed.
        device.close()

    text = retstr.getvalue()
    retstr.close()
    # Derive the title from the path itself instead of slicing by
    # len(dirpath), which only works when dirpath ends with a separator.
    filetitle = os.path.splitext(os.path.basename(path))[0]
    # return a title tag followed by the extracted text
    return "<FILE = " + filetitle + " >\n\n" + text


# define cleaning function
# Characters that survive cleaning: tab, space, newline, carriage return,
# word characters and the punctuation - ; , . : ( ) < > /
# Compiled once at import time because cleanFile runs on every corpus line.
_KEPT_CHAR = re.compile(r'[\t \n\r\w\-;,.:()<>/]')


def cleanFile(line):
    """Return *line* with every character outside the whitelist removed.

    Args:
        line: the text to clean (may be empty).

    Returns:
        A string containing, in their original order, only the characters of
        *line* matched by ``_KEPT_CHAR``; an empty string if none match.
    """
    return ''.join(_KEPT_CHAR.findall(line))

# outfile opening
with codecs.open('/home/ubuntu/Desktop/Articles-pour-Nickolas/corpus.txt', 'w', encoding='latin-1') as outfile:

    for fname in filenames:
        # Check we only work with pdf files
        if not fname.endswith(".pdf"):
            continue
        nb = nb + 1
        # Console display: current file name and running count
        print(fname)
        print(nb)
        currentfile = os.path.join(dirpath, fname)
        # handling extraction errors of Assertion type
        try:
            # calling convert pdf function on currentfile
            extracted = convert_pdf(currentfile)
        except AssertionError:
            print("file error " + fname)
            # Nothing was extracted for this file: skip it, otherwise the
            # text of the previous PDF would be written a second time.
            continue
        # handling encoding errors (mostly with filenames)
        try:
            encoded = extracted.encode('latin-1', 'replace')
            outfile.write(encoded)
        except UnicodeError:
            # Only encoding/decoding failures are reported here; I/O errors
            # are allowed to propagate instead of being mislabelled.
            print("please rename file " + fname + " this filename contains encoding abnormalities!")
    
with codecs.open(u'/home/ubuntu/Desktop/Articles-pour-Nickolas/knowledge corpus.txt', 'w', encoding='utf-8') as outfile:

    # corpus.txt is produced by the extraction step with the latin-1 codec,
    # so it has to be read back with that same codec (not utf-8).
    with codecs.open(u'/home/ubuntu/Desktop/Articles-pour-Nickolas/corpus.txt', encoding='latin-1') as infile:

        for line in infile:
            # Lines of 7 characters or fewer (line ending included) are dropped;
            # longer ones go through the cleanFile function before being written.
            if len(line) > 7:
                outfile.write(cleanFile(line))
  


#Linux shell and treetagger optional feature
  
#subprocess.call(["ls"])
#print
#os.chdir('/home/ubuntu/Desktop/Treetagger')
#subprocess.call(["ls"])
#print

#with open ("knowledge corpus.txt", "rb") as infile, open("knowledge corpus.tt", "wb") as outfile:
 #subprocess.check_call(["/home/ubuntu/Desktop/Treetagger/cmd/tree-tagger-english-utf8"], 
       #stdin=infile, stdout=outfile)

   
##############################################################################
##                   ##
##      EXAMPLES, TEST AND NOTES AREA      ##
##                   ##
##############################################################################

 
 # Bad-character handling: a lookup TABLE is more pythonic than long, unreadable strings of characters.
 # invalid_chars_table = ['^',
    # ' ', #\x01
    # ' ', #\x12
    # ' ', #\x13
    # ' ', #\x0e
    # ' ', #\x06
    # ' ', #\x0f
    # ' ', #\x02
    # ' ', #\x18
    # ' ', #\x04
    # ' ', #\x08
    # ' ', #\x05
    # ' ', #\x07
    # ' ', #\x15
    # ' ', #\x14
    # ' ', #\x03
    # ' ', #\x11
    # ' ', #\x10
    # ' ', #\x16
    # ' ', #\x0c
    # '\00'] #\x00
    
 

No comments:

Post a Comment


Free online chess

View Kapellas Nick's profile on LinkedIn
Creative Commons License
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License