数字内容安全之文本信息检索

    xiaoxiao2022-07-07  202

    文本信息检索

    实验要求

    针对“语料库.txt”文件,实现基于布尔模型的检索系统和基于TF-IDF的检索系统。(每一行看做是一个文档)

    基于布尔模型的检索系统

    输入 以布尔表达式形式输入,一次检索单词数量不超过三个(AA and BB or CC ),返回所有的击中的结果(超过10项的只显示前10项)

    基于TF-IDF的检索系统

    输入不超过8个字的短语,系统首先自动进行分词,按照TF-IDF的值求和排序返回前10项结果。

    实验过程

    由于两个检索系统都需要将语料库的每一行看作一个文档,故一开始便按行读取语料库,并分别存入 text(每行的分词及词频)和 listLine(每行的原始文本)两个集合中:

    //表示语料库中第i行文本下的分词结果,即:第i行文本中,string 词语在该行本文出现的次数 private Map<Integer, Map<String, Integer>> text = new HashMap<>(); //表示语料库第i行的文本内容 private List<String> listLine = new ArrayList<>();

    该部分代码与实验三FMM和BMM读取部分代码类似,故省略。

    基于布尔模型的检索系统

    由于输入是由词语和 and、or、not 逻辑运算符组成的表达式,而 Java 支持嵌入脚本语言,故使用 Java SE 6 起自带的 JavaScript 脚本引擎(通过 javax.script.ScriptEngine 接口调用)对表达式求值。注意:JDK 15 起已移除内置的 Nashorn 引擎,在较新的 JDK 上运行需另行引入 JavaScript 引擎:

    // 字符串转条件表达式 ScriptEngineManager manager = new ScriptEngineManager(); ScriptEngine engine = manager.getEngineByExtension("js"); // 输入input文本格式处理:如输入 ***中国 and 的 not 共产党*** String[] words = input.trim().split(" +"); String[] word3 = new String[3]; for (int i = 0; i < words.length; i+=2) { //将输入的单词存入 word3 数组中 word3[i/2] = words[i]; //将输入的 words[i] 替换为 --> 判断word文本行里是否包含words[i]这一单词 words[i] = "word.containsKey(word3[" + i/2 + "])"; } //将words数组用空格隔开 input = String.join(" ", words); //将用户输入的逻辑 替换为 --> 计算机可以识别的逻辑符号 input = input.replaceAll(" and ", " && "); input = input.replaceAll(" or ", " || "); input = input.replaceAll(" not ", " && ! "); //脚本语言的绑定 engine.put("word3", word3);

    然后对每一行文本执行该条件表达式,符合条件即输出该行文本:

    Boolean result = (Boolean) engine.eval(input);// 字符串转条件表达式 if (result) { System.out.println(listLine.get(i));//符合就输出相应文本 num ++; } if (num == 10) break;//输出前十行
    输出结果:

    基于TF-IDF的检索系统

    1、读取实验三对语料库的分词结果output.txt,利用BMM对输入的数据进行分词:
    for (int j = line.length(); j >= 0 ; j--) { for(int i=0; i<j; i++) { String string = line.substring(i, j); if (yuliaoku.containsKey(string) && !string.equals(" ")) { inputWords.add(string); j -= string.length()-1; break; } else if (i == j-1 && !string.equals(" ")) { inputWords.add(string); j -= string.length()-1; } } }
    2、获取TF值:
    /* TF values: wordTF.get(i).get(j) is the term frequency of query word j in document i. */
    Map<Integer, Map<Integer, Double>> wordTF = new HashMap<>();
    for (int i = 0; i < listLine.size(); i++) {
        // Word counts of document i.
        Map<String, Integer> map = text.get(i);
        // TF is normalised by the number of word occurrences in the document,
        // not by the number of distinct words.
        int total = 0;
        for (int occurrences : map.values()) {
            total += occurrences;
        }
        Map<Integer, Double> map1 = new HashMap<>();
        for (int j = 0; j < inputWords.size(); j++) {
            Integer num = map.get(inputWords.get(j));
            // A non-null count implies total > 0, so the division cannot yield NaN.
            double tf = (num == null) ? 0.0 : (double) num / total;
            map1.put(j, tf);
        }
        wordTF.put(i, map1);
    }
    3、获取IDF值:IDF = log(网页总数量 / 包含当前词语的网页数量):
    // IDF of each query word: log(number of documents / number of documents containing it).
    double[] wordIDF = new double[inputWords.size()];
    for (int i = 0; i < inputWords.size(); i++) {
        String queryWord = inputWords.get(i);
        // Count documents by token, the same way TF is counted, so both factors agree on
        // what "the word occurs in the document" means.
        int idf = 0;
        for (int j = 0; j < listLine.size(); j++) {
            if (text.get(j).containsKey(queryWord)) {
                idf++;
            }
        }
        if (idf == 0) {
            // The word's TF is 0 in every document, so it contributes nothing to any score;
            // report it and move on rather than terminating the program.
            System.out.println("\"" + queryWord + "\"在所有网页中未出现,其TF-IDF贡献按0计算!");
            wordIDF[i] = 0.0;
            continue;
        }
        // Divide as double: integer division would truncate the ratio before the logarithm.
        wordIDF[i] = Math.log((double) listLine.size() / idf);
    }
    4、对每个部分求和:
    //得到每个部分的TF-IDF值 double result = 0.0; for (int j = 0; j < inputWords.size(); j++) { Double map = wordTF.get(i).get(j); result += map * wordIDF[j]; }
    5、排序输出:
    // Rank the documents by TF-IDF score, highest first.
    List<Map.Entry<Integer, Double>> entryList = new ArrayList<>(tf_idf.entrySet());
    Collections.sort(entryList, new Comparator<Map.Entry<Integer, Double>>() {
        @Override
        public int compare(Map.Entry<Integer, Double> o1, Map.Entry<Integer, Double> o2) {
            return o2.getValue().compareTo(o1.getValue());
        }
    });
    // Print at most ten results; a corpus with fewer than ten lines must not overrun the list.
    int shown = Math.min(10, entryList.size());
    for (int i = 0; i < shown; i++) {
        int a = entryList.get(i).getKey();
        System.out.println("文本为:" + listLine.get(a));
        System.out.println("TF-IDF值为:" + entryList.get(i).getValue());
        // Visual separator between results.
        System.out.println("---------------------------------------------------------------------------------------------------------");
    }
    输出结果:

    实验完整代码

    package com.tuxiangchuli.java;

    import javax.script.ScriptEngine;
    import javax.script.ScriptEngineManager;
    import javax.script.ScriptException;
    import java.io.*;
    import java.util.*;

    /**
     * Line-oriented retrieval over a pre-segmented corpus in which every line is one document.
     *
     * <p>Two search modes are offered:
     * <ul>
     *   <li>Boolean search: a query such as {@code AA and BB or CC} is rewritten into a
     *       JavaScript expression and evaluated against the word set of every line.</li>
     *   <li>TF-IDF search: the query phrase is segmented by backward maximum matching and the
     *       lines are ranked by the sum of the TF-IDF values of the resulting words.</li>
     * </ul>
     *
     * <p>{@link #main} runs TF-IDF search by default; pass {@code bool} as the first program
     * argument to run Boolean search instead.
     */
    public class BooleanSearch {

        /** Dictionary file; the first field of a line is the word, the third an integer. */
        private static final String DICTIONARY_FILE = "D:\\output.txt";
        /** Corpus file; one document per line, tokens separated by spaces. */
        private static final String CORPUS_FILE = "D:\\yuliaoku.txt";
        private static final String FILE_ENCODING = "UTF-8";
        /** Upper bound on the number of lines printed per query. */
        private static final int MAX_RESULTS = 10;
        private static final String QUERY_SEPARATOR = "------------------------------------------";
        private static final String RESULT_SEPARATOR =
                "---------------------------------------------------------------------------------------------------------";

        /** Line index -> (word -> number of occurrences of the word in that line). */
        private final Map<Integer, Map<String, Integer>> text = new HashMap<>();
        /** Raw corpus lines, in file order. */
        private final List<String> listLine = new ArrayList<>();
        /** Line index -> number of word occurrences counted in that line (TF denominator). */
        private final Map<Integer, Integer> textNum = new HashMap<>();
        /** Segmentation dictionary: word -> integer value read from the dictionary file. */
        private final Map<String, Integer> yuliaoku = new HashMap<>();

        /** Opens {@code fileName} as a buffered UTF-8 character stream. */
        private static BufferedReader openUtf8(String fileName) throws IOException {
            return new BufferedReader(
                    new InputStreamReader(new FileInputStream(fileName), FILE_ENCODING));
        }

        /**
         * Loads the segmentation dictionary into {@link #yuliaoku}.
         *
         * <p>Lines with fewer than three space-separated fields are skipped; previously only
         * two-field lines were skipped and shorter ones failed with an index error.
         *
         * @throws IOException if the dictionary file cannot be read
         */
        private void readOutput() throws IOException {
            try (BufferedReader br = openUtf8(DICTIONARY_FILE)) {
                String line;
                while ((line = br.readLine()) != null) {
                    String[] input = line.trim().split(" +");
                    if (input.length < 3) {
                        continue;
                    }
                    yuliaoku.put(input[0], Integer.valueOf(input[2]));
                }
            }
        }

        /**
         * Reads the corpus and builds the per-line word counts.
         *
         * <p>For every token the part before the first {@code '/'} is taken as the word.
         * Tokens containing {@code "19980"} are skipped (presumably the line's date/ID
         * marker; confirm against the corpus). Tokens that yield an empty word are skipped
         * as well.
         *
         * @throws IOException if the corpus file cannot be read
         */
        private void readText() throws IOException {
            try (BufferedReader br = openUtf8(CORPUS_FILE)) {
                String line;
                while ((line = br.readLine()) != null) {
                    listLine.add(line);
                }
            }
            for (int i = 0; i < listLine.size(); i++) {
                Map<String, Integer> words = new HashMap<>();
                int counted = 0;
                for (String token : listLine.get(i).trim().split(" +")) {
                    if (token.contains("19980")) {
                        continue;
                    }
                    int slash = token.indexOf('/');
                    String word = slash < 0 ? token : token.substring(0, slash);
                    if (word.isEmpty()) {
                        // Blank line or a token starting with '/': nothing to index.
                        continue;
                    }
                    words.merge(word, 1, Integer::sum);
                    counted++;
                }
                textNum.put(i, counted);
                text.put(i, words);
            }
        }

        /**
         * Segments {@code line} by backward maximum matching against the dictionary.
         *
         * <p>Starting at the end of the text, the longest dictionary word ending at the
         * current position is taken; if none matches, the single character at that position
         * becomes a word of its own. Whitespace characters are dropped.
         *
         * @param line the query text
         * @return the words in reading order
         */
        private List<String> segment(String line) {
            List<String> words = new ArrayList<>();
            int end = line.length();
            while (end > 0) {
                int start = 0;
                while (start < end - 1 && !yuliaoku.containsKey(line.substring(start, end))) {
                    start++;
                }
                String piece = line.substring(start, end);
                if (piece.length() > 1 || !Character.isWhitespace(piece.charAt(0))) {
                    words.add(piece);
                }
                end = start;
            }
            // Matching runs right-to-left; restore reading order for display.
            Collections.reverse(words);
            return words;
        }

        /**
         * Ranks all corpus lines against {@code line} by summed TF-IDF and prints the best
         * {@value #MAX_RESULTS} (fewer if the corpus is smaller).
         *
         * <p>TF is the word's count in the line divided by the number of word occurrences in
         * the line. IDF is {@code ln(lines / lines containing the word)}, counted on the same
         * tokens as TF. A word that occurs in no line is reported and contributes 0.
         *
         * @param line the query phrase
         * @throws IOException if the dictionary has to be loaded and cannot be read
         */
        private void tf_IDFSearch(String line) throws IOException {
            if (yuliaoku.isEmpty()) {
                // Load the dictionary once instead of re-reading the file on every query.
                readOutput();
            }
            List<String> inputWords = segment(line);

            System.out.println(QUERY_SEPARATOR);
            System.out.println("输入的分词:" + inputWords);
            System.out.println(QUERY_SEPARATOR);

            double[] wordIDF = new double[inputWords.size()];
            for (int i = 0; i < inputWords.size(); i++) {
                String queryWord = inputWords.get(i);
                int documentFrequency = 0;
                for (int j = 0; j < listLine.size(); j++) {
                    if (text.get(j).containsKey(queryWord)) {
                        documentFrequency++;
                    }
                }
                if (documentFrequency == 0) {
                    System.out.println("\"" + queryWord + "\"在所有网页中未出现,其TF-IDF贡献按0计算!");
                    continue; // wordIDF[i] stays 0.0
                }
                // Divide as double: integer division would truncate the ratio before the log.
                wordIDF[i] = Math.log((double) listLine.size() / documentFrequency);
            }

            double[] scores = new double[listLine.size()];
            List<Integer> ranking = new ArrayList<>(listLine.size());
            for (int doc = 0; doc < listLine.size(); doc++) {
                Map<String, Integer> counts = text.get(doc);
                int total = textNum.get(doc);
                double score = 0.0;
                if (total > 0) { // an empty line scores 0 instead of NaN
                    for (int w = 0; w < inputWords.size(); w++) {
                        Integer count = counts.get(inputWords.get(w));
                        if (count != null) {
                            score += (double) count / total * wordIDF[w];
                        }
                    }
                }
                scores[doc] = score;
                ranking.add(doc);
            }
            // Highest score first; the sort is stable, so ties keep corpus order.
            ranking.sort((a, b) -> Double.compare(scores[b], scores[a]));

            int shown = Math.min(MAX_RESULTS, ranking.size());
            for (int i = 0; i < shown; i++) {
                int doc = ranking.get(i);
                System.out.println("文本为:" + listLine.get(doc));
                System.out.println("TF-IDF值为:" + scores[doc]);
                System.out.println(RESULT_SEPARATOR);
            }
        }

        /**
         * Maps a query operator to its JavaScript form.
         *
         * @throws ScriptException if {@code operator} is not {@code and}, {@code or} or
         *     {@code not} (case-insensitive)
         */
        private static String toScriptOperator(String operator) throws ScriptException {
            switch (operator.toLowerCase(Locale.ROOT)) {
                case "and":
                    return "&&";
                case "or":
                    return "||";
                case "not":
                    return "&& !";
                default:
                    throw new ScriptException(
                            "Unknown operator \"" + operator + "\"; expected and, or, not");
            }
        }

        /**
         * Prints up to {@value #MAX_RESULTS} corpus lines whose word set satisfies the query.
         *
         * <p>The query alternates terms and operators ({@code term (op term)*}). It is
         * rewritten into a JavaScript expression over {@code word.containsKey(...)} calls, so
         * JavaScript precedence applies. Only whitelisted operators are placed in the script
         * text; terms reach the script through the {@code word3} binding, so user input is
         * never evaluated as code.
         *
         * @param input the Boolean query, e.g. {@code 中国 and 的 not 共产党}
         * @throws ScriptException if the query is malformed, no JavaScript engine is
         *     available, or evaluation fails
         */
        private void boolSearch(String input) throws ScriptException {
            String[] tokens = input.trim().split(" +");
            if (tokens[0].isEmpty() || tokens.length % 2 == 0) {
                throw new ScriptException(
                        "Query must alternate terms and operators and end with a term");
            }
            // One slot per term; the former fixed size of 3 overflowed on a fourth term.
            String[] word3 = new String[(tokens.length + 1) / 2];
            StringBuilder script = new StringBuilder();
            for (int i = 0; i < tokens.length; i++) {
                if (i % 2 == 0) {
                    word3[i / 2] = tokens[i];
                    script.append("word.containsKey(word3[").append(i / 2).append("])");
                } else {
                    script.append(' ').append(toScriptOperator(tokens[i])).append(' ');
                }
            }

            ScriptEngine engine = new ScriptEngineManager().getEngineByExtension("js");
            if (engine == null) {
                // The lookup returns null when the runtime ships no JavaScript engine.
                throw new ScriptException(
                        "No JavaScript script engine is available in this Java runtime");
            }
            engine.put("word3", word3);

            String expression = script.toString();
            int hits = 0;
            for (int i = 0; i < text.size() && hits < MAX_RESULTS; i++) {
                engine.put("word", text.get(i));
                if (Boolean.TRUE.equals(engine.eval(expression))) {
                    System.out.println(listLine.get(i));
                    hits++;
                }
            }
        }

        /**
         * Loads the corpus, then answers one query per line of standard input.
         *
         * @param args optional; {@code bool} as the first argument selects Boolean search,
         *     otherwise TF-IDF search is used
         * @throws IOException if the corpus or dictionary file cannot be read
         * @throws ScriptException kept in the signature for compatibility; malformed Boolean
         *     queries are reported on the console and the loop continues
         */
        public static void main(String[] args) throws IOException, ScriptException {
            BooleanSearch booleanSearch = new BooleanSearch();
            booleanSearch.readText();
            boolean booleanMode = args.length > 0 && "bool".equalsIgnoreCase(args[0]);

            Scanner scanner = new Scanner(System.in);
            while (scanner.hasNextLine()) {
                String input = scanner.nextLine();
                if (input.trim().isEmpty()) {
                    continue;
                }
                if (booleanMode) {
                    try {
                        booleanSearch.boolSearch(input);
                    } catch (ScriptException e) {
                        // A bad query should not end the interactive session.
                        System.out.println("布尔表达式有误:" + e.getMessage());
                    }
                } else {
                    booleanSearch.tf_IDFSearch(input);
                }
            }
        }
    }
    最新回复(0)