Source code for pytextprep.extract_ngram

def extract_ngram(tweets, n):
    """Extract word n-grams from the input tweet data.

    All tweets are first joined into a single stream of words, so the
    n-grams are computed over the concatenated text and may span the
    boundary between two consecutive tweets.

    Parameters
    ----------
    tweets : list
        List of tweets (strings).
    n : int
        Length of n-grams to be created.

    Returns
    -------
    list
        List of n-grams generated, each one a string of ``n`` words
        separated by single spaces. The list is empty when ``n`` is not
        positive or when there are fewer than ``n`` words in total.

    Raises
    ------
    TypeError
        If ``tweets`` is not a ``list``.

    Examples
    --------
    >>> extract_ngram(tweets=["Make America Great Again DonaldTrump"], n=3)
    ['Make America Great', 'America Great Again', 'Great Again DonaldTrump']
    """
    # Check for correct input type
    if not isinstance(tweets, list):
        raise TypeError("'tweets' should be of type 'list'.")

    # Merge every tweet into one string, then break it into words.
    # ``str.split()`` with no argument splits on any run of whitespace
    # (spaces, tabs, newlines) and never yields empty strings, so words
    # separated by a line break inside a tweet are not glued together.
    words = " ".join(tweets).split()

    # Zipping the word list against itself shifted by 0..n-1 positions
    # yields every window of n consecutive words; zip stops at the
    # shortest slice, so incomplete trailing windows are dropped.
    windows = zip(*(words[offset:] for offset in range(n)))

    # Concatenate and return ngrams
    return [" ".join(window) for window in windows]