Tuesday 2 October 2018

Identifying sentence boundaries in a paragraph using only full stops - Python


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/python
#open a file, clean its contents, tokenize it and identify sentence boundaries using .
import re

#matches every punctuation mark that has to become a token of its own
_PUNCTUATION = re.compile(r'([.,?!])')
#matches any run of whitespace
_WHITESPACE = re.compile(r'\s+')


def iter_sentence_markup(lines, start_id=1):
    """Yield the marked-up output, one printable string at a time.

    Every input line is split into sentences at each '.'; every sentence is
    emitted as

        <Sentence Id='N'>
        1 <tab> first-token
        ...
        k <tab> .
        </Sentence>

    lines    -- iterable of strings (one per line of the corpus)
    start_id -- Id given to the first sentence; later Ids increase by 1

    Case is preserved. A closing '.' token is emitted for every sentence,
    including a trailing fragment that had no full stop in the input.
    """
    sentence_id = start_id

    for line in lines:
        #skip blank lines (a blank line must not stop the whole run)
        if line == "":
            continue

        #cleaning: put a space before . , ? ! so that each one is a separate token
        line = _PUNCTUATION.sub(r' \1', line)

        #replace multiple spaces with a single space
        line = _WHITESPACE.sub(" ", line)

        #each '.' is treated as a sentence boundary
        for sentence in line.split('.'):
            #split() drops the empty strings left by leading/trailing spaces
            words = sentence.split()
            if not words:
                continue

            yield "<Sentence Id='%d'>" % sentence_id

            #token counter starts at 1 for every sentence
            for j, word in enumerate(words, 1):
                yield "%d \t %s" % (j, word)

            #the full stop removed by split('.') is emitted as the last token
            yield "%d \t ." % (len(words) + 1)
            yield "</Sentence>"

            sentence_id += 1


def main():
    """Read 'raw_corpus.txt' and print its sentence/token markup."""
    with open('raw_corpus.txt') as fp:
        lines = fp.read().split("\n")   #here lines contains entire file contents

    for out_line in iter_sentence_markup(lines):
        print(out_line)


if __name__ == "__main__":
    main()


This script opens 'raw_corpus.txt' and reads its contents line by line.

It then splits each line at '.', which is treated as the sentence boundary. Each sentence is then split into tokens on spaces. The tokens are numbered from 1 within each sentence and printed together with the sentence they belong to.

Finding word frequency in Python - Dictionary


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/python
#Program to read a file(corpus) and find frequency of each token
import re
from collections import Counter


def count_words(text):
    """Return a Counter mapping each cleaned token of `text` to its frequency.

    Tokens are obtained with split(), which splits on any whitespace
    (' ', '\\t' and '\\n'). Each token is lowercased and every '.' in it is
    removed; tokens that become empty after cleaning (e.g. '...') are not
    counted.
    """
    wordcount = Counter()

    for word in text.split():
        #cleaning corpus: convert to lowercase and substitute . with empty
        word = re.sub(r'\.', "", word.lower())

        #a token made only of dots is empty now and is not a word
        if word:
            wordcount[word] += 1

    return wordcount


def main():
    """Print every token of 'raw_corpus.txt' with its frequency, sorted by token."""
    #read-only access is enough; "with" closes the file when done
    with open("raw_corpus.txt", "r") as corpus:
        wordcount = count_words(corpus.read())

    #print the dictionary with sorted keys(tokens) and values
    for k in sorted(wordcount):
        print (k, wordcount[k])


if __name__ == "__main__":
    main()

This script opens the file 'raw_corpus.txt' and splits its contents into words. Each word is stored in a dictionary, with the word as the key and its frequency as the value. When the same word (key) is encountered again, its value is incremented by 1.


Finding character frequency using Python - Dictionary


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
#open a file and find its character frequency
from collections import Counter


def count_chars(lines):
    """Return a Counter mapping each character to its frequency.

    lines -- iterable of strings; every string is lowercased before its
             characters are counted, so 'A' and 'a' share one entry.
    """
    charcount = Counter()

    #to access file contents line by line
    for line in lines:
        #update() with a string counts each of its characters
        charcount.update(line.lower())

    return charcount


def main():
    """Print every character of 'raw_corpus.txt' with its frequency, sorted by character."""
    with open('raw_corpus.txt') as fp:
        #splitting on "\n" removes the newline characters, so they are not counted
        lines = fp.read().split("\n")   #here lines contains entire file contents

    charcount = count_chars(lines)

    #print the dictionary with sorted keys(characters) and values
    for k in sorted(charcount):
        print (k, charcount[k])


if __name__ == "__main__":
    main()

This script will open the file 'raw_corpus.txt', read its contents line by line, then find the frequency of each character and store it in a dictionary.

A dictionary in Python is similar to a hash in Perl. It stores one value for each key; when a value is stored under a key that already exists, the old value is overwritten.

Tokenization and sentence boundary assuming each sentence is in new line


 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/python
#open a file, clean its contents and tokenize it, treating every line as one sentence
import re

#matches every punctuation mark that has to become a token of its own
_PUNCTUATION = re.compile(r'([.,?!])')
#matches any run of whitespace
_WHITESPACE = re.compile(r'\s+')


def iter_line_markup(lines, start_id=1):
    """Yield the marked-up output, one printable string at a time.

    Every non-blank input line is treated as one sentence and emitted as

        <Sentence Id='N'>
        1 <tab> first-token
        ...
        </Sentence>

    lines    -- iterable of strings (one sentence per string)
    start_id -- Id given to the first sentence; later Ids increase by 1

    Lines are lowercased, and . , ? ! are separated from the words so that
    each mark is a token of its own.
    """
    sentence_id = start_id

    for line in lines:
        #skip blank lines (a blank line must not stop the whole run)
        if line == "":
            continue

        #convert to lowercase
        line = line.lower()

        #cleaning: put a space before . , ? ! so that each one is a separate token
        line = _PUNCTUATION.sub(r' \1', line)

        #replace multiple spaces with a single space
        line = _WHITESPACE.sub(" ", line)

        #get words in current line; split() drops the empty strings that
        #leading/trailing spaces would otherwise produce as tokens
        words = line.split()
        if not words:
            continue

        yield "<Sentence Id='%d'>" % sentence_id

        #token counter starts at 1 for every sentence
        for j, word in enumerate(words, 1):
            yield "%d \t %s" % (j, word)

        yield "</Sentence>"

        sentence_id += 1


def main():
    """Read 'raw_corpus.txt' and print its sentence/token markup."""
    with open('raw_corpus.txt') as fp:
        lines = fp.read().split("\n")   #here lines contains entire file contents

    for out_line in iter_line_markup(lines):
        print(out_line)


if __name__ == "__main__":
    main()

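This script will open 'raw_corpus.txt' and clean its contents: the text is lowercased, punctuation marks are separated from the words, and repeated spaces are collapsed.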

It will also print the sentence boundaries and tokenize each sentence into words.

open file using 'open mode'


1
2
3
4
5
6
7
8
9
#open file using open file mode
fp = open('raw_corpus.txt') # Open file in read mode (the default mode)
try:
    lines = fp.read().split("\n") # Create a list containing all lines
finally:
    fp.close() # Close file, even when reading it raised an error


#read file line by line
for line in lines:
    print (line)


This script will print contents of file 'raw_corpus.txt' line by line.

Reading a file using "with"


1
2
3
4
5
6
7
8
#reading a file using "with" (recommended): the file is closed when the block ends
with open('raw_corpus.txt') as corpus_file:
    contents = corpus_file.read()

#split the text at every newline; lines holds the entire file contents
lines = contents.split("\n")

#print the file contents one line at a time
for text in lines:
    print(text)

When you run this python script, the contents of the file 'raw_corpus.txt' are printed line by line.