你遇到的是一个很关键的问题。编写代码找出文件1末尾与文件2开头的重复文本很容易。但你并不想删除这些重复文本——你想在第二篇文章开始的地方把它<em>拆分</em>开。要拆分得准确可能比较棘手:一个标记是全部大写的标题行,另一个标记是下一行开头的 BY。
如果能有几个连续文件的示例会更有帮助,不过下面的脚本在一个测试用例上可以正常工作。
<strong>在尝试此代码之前,请备份所有文件。</strong>
该代码会<em>覆写</em>现有文件。
实现语言是 Lua。
该算法大致是:
<ol>
<li>
忽略文件1末尾和文件2开头的空行。
</li>
<li>
查找文件1末尾与文件2开头共有的一长段行序列。
<ul>
<li>
做法是先尝试40行的序列,然后是39行,依此类推。
</li>
</ul>
</li>
<li>
从两个文件中删除该序列,并称之为 <code>overlap</code>。
</li>
<li>
在标题处拆分 overlap。
</li>
<li>
将 overlap 的第一部分追加到 file1 的末尾;将第二部分添加到 file2 的开头。
</li>
<li>
用行列表覆写两个文件的内容。
</li>
</ol>
这是代码:
#!/usr/bin/env lua
-- When the first command-line argument is '-xxx', write_lines appends '.xxx'
-- to every file name it opens; otherwise it writes to the given name itself.
-- The flag is removed from `arg` so that the remaining arguments shift down.
local ext = arg[1] == '-xxx' and '.xxx' or ''
if #ext > 0 then table.remove(arg, 1) end

--- Read a file into a list of lines.
-- Line terminators are already removed by io.lines.
-- Raises an error if the file cannot be opened or yields no lines.
-- @tparam string filename path of the file to read
-- @treturn table sequence of strings, one per line
local function lines(filename)
  local l = {}
  -- NOTE(review): the transcribed source passed every line through
  -- gsub('', ''), which changes nothing; a pattern was probably lost on the
  -- way. Confirm against the author's copy before relying on raw lines.
  for line in io.lines(filename) do
    l[#l + 1] = line
  end
  assert(#l > 0, "No lines in file " .. filename)
  return l
end

--- Write a list of lines to `filename .. ext`, replacing any existing content.
-- Each line is followed by a single '\n'.
-- Raises an error if the file cannot be opened or a write fails.
-- @tparam string filename base name of the output file
-- @tparam table lines sequence of strings to write
local function write_lines(filename, lines)
  local f = assert(io.open(filename .. ext, 'w'))
  for i = 1, #lines do
    -- write returns nil plus a message on failure; do not lose that silently
    -- when the target is a file being overwritten.
    assert(f:write(lines[i], '\n'))
  end
  f:close()
end

--- Compare two lines for equality, logging the comparison to stderr.
-- @tparam string line1
-- @tparam string line2
-- @treturn boolean true when the lines are identical
local function lines_match(line1, line2)
  io.stderr:write(string.format("%q ==? %q\n", line1, line2))
  return line1 == line2 -- could do an approximate match here
end

--- Test whether the last k lines of l1 equal the first k lines of l2.
-- Progress and the first mismatch after a partial match are logged to stderr.
-- @tparam table l1 lines of the first file
-- @tparam table l2 lines of the second file
-- @tparam number k number of lines to compare
-- @treturn boolean false when k exceeds either list or any pair differs
local function lines_overlap(l1, l2, k)
  if k > #l2 or k > #l1 then return false end
  io.stderr:write('* k = ', k, '\n')
  for i = 1, k do
    if not lines_match(l2[i], l1[#l1 - k + i]) then
      if i > 1 then
        io.stderr:write('After ', i - 1, ' matches: FAILED <====\n')
      end
      return false
    end
  end
  return true
end

--- Find the block of lines shared by the end of one file and the start of
-- the next.
-- Blank lines at the end of the first file and at the start of the second
-- are discarded before matching. Candidate sizes are tried from the largest
-- down to one line, and the first size that matches wins.
-- Left global, as in the original; nothing visible here requires that.
-- @tparam string fname1 path of the first file
-- @tparam string fname2 path of the second file
-- @tparam[opt=40] number max_lines largest overlap size to try
-- @return false when no overlap exists; otherwise three line lists: the
--   first file without the overlap, the overlap, the second file without it
function find_overlaps(fname1, fname2, max_lines)
  max_lines = max_lines or 40
  local l1, l2 = lines(fname1), lines(fname2)
  -- Strip trailing and leading blank lines. '^%s*$' also matches the empty
  -- string, which is what io.lines yields for a blank line; the size guards
  -- stop the loops from indexing nil when a file holds only blank lines.
  while #l1 > 0 and l1[#l1]:find('^%s*$') do table.remove(l1) end
  while #l2 > 0 and l2[1]:find('^%s*$') do table.remove(l2, 1) end

  local matchsize -- # of lines at end of file 1 that are equal to the same
                  -- # at the start of file 2
  for k = math.min(max_lines, #l1, #l2), 1, -1 do
    if lines_overlap(l1, l2, k) then
      matchsize = k
      io.stderr:write('Found match of ', k, ' lines\n')
      break
    end
  end

  if matchsize == nil then
    return false -- failed to find an overlap
  end

  local overlap = {}
  for _ = 1, matchsize do
    table.remove(l1) -- remove line from first set
    overlap[#overlap + 1] = table.remove(l2, 1)
  end
  return l1, overlap, l2
end

--- Split an overlap at the title of the second article.
-- The title is the first line that contains an upper-case letter and no
-- lower-case letter and is directly followed by a line starting with "BY"
-- plus whitespace (leading whitespace allowed).
-- @tparam table l overlap lines; modified in place
-- @return the lines before the title, then `l` itself now starting at the
--   title; returns nothing when no title/byline pair is found
local function split_overlap(l)
  for i = 1, #l - 1 do
    if l[i]:match('%u') and not l[i]:match('%l') then -- has caps but no lowers
      -- io.stderr:write('Looking for byline following ', l[i], '\n')
      if l[i + 1]:match('^%s*BY%s') then
        local first = {}
        for _ = 1, i - 1 do
          first[#first + 1] = table.remove(l, 1)
        end
        -- io.stderr:write('Split with first line at ', l[1], '\n')
        return first, l
      end
    end
  end
end

local function strip_overlaps(filename1, filename2)
local l1, overlap, l2 = find_overlaps(filename1, filename2)
if not l1 then
io.stderr:write(‘No overlap in ‘, filename1, ‘ an</code>