How to build a Word2Vec (word-to-vector) model for the Urdu language.

Vector representation of words

A word-to-vector (w2v) model is an important building block of a language model. It gives you insight into the language model and how effective that model is. There is no general-purpose w2v model for whichever language you want to use; you need to build one for your language, depending on the domain and the specific application. You can build a large w2v model, but it puts a heavy burden on performance and uses a lot of resources in the application. So you need to choose the dataset accordingly and build a w2v model for that specific application. For example, if you are building a model for news, choose a news dataset and build the w2v model from that dataset. In this article, I'm going to build a w2v model for a freely available journalism dataset.

COUNTER (COrpus of Urdu News TExt Reuse):
This dataset is collected from journalism and can be used for Urdu NLP research. Here is the link to the resource for more information. The dataset is in XML format, so you have to convert it into a form that can be used for processing. I've written a Python script that reads the XML files and converts them into a single txt file for further processing.

import plac
import codecs
import os
from xml.etree import ElementTree as ET


# Base path of the COUNTER dataset; replace this placeholder with the real
# location before running the script.
PATH = "path/to/COUNTER"

def chunks(l, n):
    """
    Split a list into consecutive slices of at most ``n`` items.

    :param l: input list
    :param n: size of chunk; values below 1 are treated as 1
    :return: generator yielding the slices in order (the last may be shorter)
    """
    step = n if n > 1 else 1
    starts = range(0, len(l), step)
    return (l[start:start + step] for start in starts)

def save_file(urdu_list):
    """
    Write the given lines to ``counter.txt`` inside ``PATH``, one per line.

    :param urdu_list: iterable of strings; a newline is appended to each one
    :return: None
    """
    # os.path.join inserts the separator between PATH and the file name;
    # plain concatenation produced ".../COUNTERcounter.txt" when PATH has no
    # trailing slash.
    out_path = os.path.join(PATH, "counter.txt")
    with codecs.open(out_path, 'w', encoding='utf-8') as fp:
        for line in urdu_list:
            fp.write(line + "\n")


def read_from_xml_directory():
    """
    Read every file in ``PATH/corpus/COUNTER`` and save the text to a file.

    Each file is parsed as XML. The text of the second child element of the
    document root is split on single spaces and regrouped into lines of at
    most 100 words; all lines are then passed to ``save_file``.

    :return: None
    """
    corpus_dir = os.path.join(PATH, 'corpus', 'COUNTER')
    urdu_list = []
    # os.listdir returns names in arbitrary order; sorting keeps the output
    # file reproducible between runs.
    for xml in sorted(os.listdir(corpus_dir)):
        root = ET.parse(os.path.join(corpus_dir, xml)).getroot()
        # An element without text has ``.text`` set to None; skip it instead
        # of failing on ``None.split``.
        text = root[1].text
        if not text or not text.strip():
            continue
        for words in chunks(text.split(" "), 100):
            urdu_list.append(" ".join(words))

    save_file(urdu_list)


def main():
    """Convert the COUNTER XML files to a text file, then print "Done."."""
    read_from_xml_directory()
    print("Done.")


# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    plac.call(main)

Now we have a txt file that can be used for w2v.

Here is the gensim script to build the w2v model.

#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import codecs
import io
import logging
import re
from os import path
import os
import plac
try:
    import ujson as json
except ImportError:
    import json
from gensim.models import Word2Vec
from preshed.counter import PreshCounter

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


class Corpus(object):
    """Text files found below a directory, plus a counter for their tokens."""

    def __init__(self, directory, min_freq=10):
        """
        Store the corpus location and create an empty counter.

        :param directory: directory handed to ``iter_dir`` when iterating
        :param min_freq: stored on the instance as ``min_freq``
        """
        self.min_freq = min_freq
        self.directory = directory
        self.strings = {}
        self.counts = PreshCounter()

    def count_doc(self, doc):
        """
        Add one to the count of every item's ``orth`` value.

        :param doc: sized iterable of objects exposing an ``orth`` attribute
        :return: ``len(doc)``
        """
        bump = self.counts.inc
        for token in doc:
            bump(token.orth, 1)
        return len(doc)

    def __iter__(self):
        """Yield the full content of each file reported by ``iter_dir``."""
        for location in iter_dir(self.directory):
            # errors='ignore' drops bytes that are not valid UTF-8
            with io.open(location, 'r', encoding='utf8', errors='ignore') as handle:
                content = handle.read()
            yield content


def iter_dir(loc):
    """
    Yield the paths found in ``loc``, descending one level into directories.

    A plain entry is yielded as ``loc/entry``. For an entry that is a
    directory, each of its own entries is yielded as ``loc/entry/child``
    without checking whether that child is itself a directory.
    """
    for entry in os.listdir(loc):
        entry_path = path.join(loc, entry)
        if not path.isdir(entry_path):
            yield entry_path
            continue
        for child in os.listdir(entry_path):
            yield path.join(loc, entry, child)


# Substitutions applied, in this order, to every input line. Each match is
# replaced by a single space. Compiled once instead of once per line.
_CLEANUP_PATTERNS = tuple(re.compile(pattern) for pattern in (
    # digits
    r"\d+",
    # English punctuation
    r"""[!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~]+""",
    # Urdu punctuation
    # NOTE(review): this class also contains the letter hamza (ء); confirm
    # that stripping it is intended.
    r"[:؛؟’‘٭ء،۔]+",
    # Arabic-Indic digits, plus the invisible left-to-right mark (U+200E)
    r"[\u0660-\u0669\u200e]+",
    # anything that is neither a word character nor whitespace
    r"[^\w\s]",
    # Latin letters and ASCII digits ("A-Z", not "A-z", which would also
    # cover the punctuation between "Z" and "a")
    r"[a-zA-Z0-9]+",
))


def _tokenize_line(line):
    """Return the tokens left in ``line`` after all cleanup substitutions."""
    for pattern in _CLEANUP_PATTERNS:
        line = pattern.sub(" ", line)
    # split() without arguments splits on any whitespace run and never
    # returns empty tokens, so tabs cannot stay embedded inside a token.
    return line.split()


@plac.annotations(
    in_dir="Location of input directory",
    out_dir="Location of output file",
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(in_dir, out_dir, negative=5, n_workers=4, window=5, size=100, min_count=5, nr_iter=2):
    """
    Train a Word2Vec model on the text files in ``in_dir`` and save the vectors.

    Every line of every file reported by ``iter_dir(in_dir)`` is cleaned of
    digits, punctuation and Latin text, and then used as one training
    sentence. The vectors are written to ``out_dir`` in the textual word2vec
    format.

    NOTE(review): the ``size`` and ``iter`` keyword names passed to Word2Vec
    are those of gensim releases before 4.0 -- confirm the installed version.

    :param in_dir: directory containing the input text files
    :param out_dir: path of the output vector file
    :param negative: number of negative samples
    :param n_workers: number of worker threads
    :param window: context window size
    :param size: dimension of the word vectors
    :param min_count: minimum frequency passed to Word2Vec
    :param nr_iter: number of training iterations (epochs) over the corpus
    :return: None
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    sentences = []
    for text_loc in iter_dir(in_dir):
        with codecs.open(text_loc, "r", encoding='utf-8', errors='ignore') as file_:
            for line in file_:
                tokens = _tokenize_line(line)
                # One sentence per line: merging a whole file into a single
                # sentence would exceed gensim's 10000-word limit per text,
                # and the words past that limit would be dropped.
                if tokens:
                    sentences.append(tokens)

    # Word2Vec builds the vocabulary and trains during construction when
    # ``sentences`` is supplied, so no separate train() call is made.
    # ``nr_iter`` now actually controls the number of epochs.
    model = Word2Vec(
        sentences=sentences,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
        iter=nr_iter
    )
    model.wv.save_word2vec_format(out_dir, binary=False)


# When run as a script, plac.call invokes main with arguments taken from the
# command line.
if __name__ == '__main__':
    plac.call(main)

And that's pretty much it. You have a nice w2v model for the Urdu language. Use it, share it, spread it.

If you have any questions, plz don't ask :).