# Created by Matt Asher for statisticsblog.com
# Feel free to share and modify so long as this header remains

# ---- Configuration ----
# Edit the values below before running the script.

# Text file to use to generate transition probabilities
sourceText = "/path/to/source.txt"

# Should we write the result back to a text file?
# (Written as the literal TRUE: T is an ordinary variable that other code can
# reassign, whereas TRUE is a reserved word.)
saveOutput = TRUE

# Output text file
resultTxt = "/path/to/output.txt"

# Number of characters for the fake text
newTxtLength = 10^5

# Read the source file in one go. The file size in bytes is passed as the
# number of characters to read, so the whole file comes back as one string.
txt <- readChar(sourceText, nchars = file.size(sourceText))

# Break that string apart so that each element of txt is a single character
txt <- unlist(strsplit(txt, split = "", fixed = TRUE))

# Number of adjacent character pairs in the text
lenM1 <- length(txt) - 1

# The distinct characters of the source text, in order of first appearance
uniqueChars <- unique(txt)

# Create main transition matrix: tMat[r, c] counts how often the character
# uniqueChars[r] is immediately followed by uniqueChars[c] in the source text
nUniqueChars = length(uniqueChars)
tMat = matrix(0, nrow = nUniqueChars, ncol = nUniqueChars)

# Map every character of the text to its position in uniqueChars once, rather
# than searching uniqueChars twice per character inside a loop
charIndex = match(txt, uniqueChars)
nTxt = length(charIndex)

# Count all adjacent pairs in a single pass. A text with fewer than two
# characters has no pairs, in which case the matrix simply stays all zero
if (nTxt > 1) {
	fromIndex = charIndex[-nTxt]
	toIndex = charIndex[-1]
	# Position of cell [fromIndex, toIndex] in the column-major storage of tMat
	cellIndex = (toIndex - 1) * nUniqueChars + fromIndex
	tMat[] = tabulate(cellIndex, nbins = nUniqueChars^2)
}

# Convert counts to frequencies (note, this will generally be a sparse matrix).
# The counts are divided by the grand total, so the whole matrix (not each row)
# sums to 1. Skip the division when there are no transitions at all, which
# would otherwise turn every cell into NaN.
totalTransitions = sum(tMat)
if (totalTransitions > 0) {
	tMat = tMat/totalTransitions
}

# Generate the fake text by walking the chain: each new character is drawn
# using the row of tMat that belongs to the previous character as weights.

# Seed character to begin fake text. You could also pick one randomly
seedChar = "\n"

# Fail early with a clear message instead of an obscure sampling error
if (length(uniqueChars) == 0) {
	stop("Source text is empty, so there are no characters to sample from", call. = FALSE)
}
if (!is.numeric(newTxtLength) || length(newTxtLength) != 1 || is.na(newTxtLength) || newTxtLength < 1) {
	stop("newTxtLength must be a single number >= 1", call. = FALSE)
}

nUniqueChars = length(uniqueChars)

# Work with positions in uniqueChars rather than the characters themselves, so
# no lookup is needed on every step
currIndex = match(seedChar, uniqueChars)
if (is.na(currIndex)) {
	# The seed never occurs in the source text (e.g. a file without newlines),
	# so it has no row in tMat. Start from the first character of the source.
	currIndex = 1L
}

# Weights to use when the current character has no recorded successor, which
# happens for a character seen only as the very last character of the source.
# Column sums give the overall frequency of each character as a successor; if
# even those are unusable, NULL makes the draw uniform.
fallbackFreq = colSums(tMat)
if (!isTRUE(any(fallbackFreq > 0))) {
	fallbackFreq = NULL
}

newTxtIndex = integer(newTxtLength)
newTxtIndex[1] = currIndex

# seq_len(n)[-1] is empty when n is 1, unlike 2:n which would count downwards
for(j in seq_len(newTxtLength)[-1]) {

	# Look at the corresponding row of the matrix
	tFreq = tMat[currIndex,]
	if (!isTRUE(any(tFreq > 0))) {
		tFreq = fallbackFreq
	}

	# Pick a new character based on transition probabilities (the weights need
	# not sum to 1)
	currIndex = sample.int(nUniqueChars, 1, prob=tFreq)
	newTxtIndex[j] = currIndex

}

# Translate positions back to characters and collapse it all into a single string
newTxt = paste(uniqueChars[newTxtIndex], collapse="")

# Optionally write the generated text to resultTxt
if(saveOutput) {
	# Save this new text file. The connection is opened explicitly and closed
	# in a finally clause, so it is released even if the write itself fails.
	fileConn<-file(resultTxt, open="w")
	tryCatch(
		writeLines(newTxt, fileConn),
		finally = close(fileConn)
	)
}


