Sunday 2 June 2019

Parse data from an unstructured text file in Java

package main.java;

import java.io.File;
import java.io.FileNotFoundException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scans a plain-text file for e-mail addresses, URLs, dates and times and prints
 * every occurrence to standard output.
 *
 * <p>The input file is {@code src/main/resources/Test-RegexText.txt}, resolved against
 * the JVM working directory ({@code user.dir}). All matches on a line are reported,
 * not just the first one.
 */
public class ParseFile {

    /**
     * Matches an e-mail address. The dot before the top-level domain is escaped so
     * that a literal {@code '.'} is required (an unescaped dot would accept any character).
     */
    public static final Pattern VALID_EMAIL_ADDRESS_REGEX = Pattern.compile("[a-z.0-9-]{1,30}@[a-z0-9-]{1,65}\\.[a-z]{1,}", Pattern.CASE_INSENSITIVE);

    /** Matches an http, https, ftp or file URL. */
    public static final Pattern VALID_URL_REGEX = Pattern.compile("(https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]", Pattern.CASE_INSENSITIVE);

    /** Matches a date written with either {@code '/'} or {@code '-'} separators and a 2- or 4-digit year. */
    public static final Pattern VALID_DATE_REGEX = Pattern.compile("[0-3]?[0-9]/[0-3]?[0-9]/(?:[0-9]{2})?[0-9]{2}|[0-3]?[0-9]-[0-3]?[0-9]-(?:[0-9]{2})?[0-9]{2}", Pattern.CASE_INSENSITIVE);

    /**
     * Matches a 24-hour time written as {@code H.mm.ss} / {@code HH.mm.ss}. The separators
     * are escaped literal dots so the match agrees with the {@code "HH.mm.ss"} format used
     * to parse it; unescaped dots would also match text such as {@code 12/05/20}.
     */
    public static final Pattern VALID_TIME_REGEX = Pattern.compile("([2][0-3]|[0-1][0-9]|[1-9])\\.[0-5][0-9]\\.([0-5][0-9]|[6][0])", Pattern.CASE_INSENSITIVE);

    /**
     * Reads the input file once, extracts all matches for each pattern and prints them
     * grouped by kind (e-mails, URLs, dates, times).
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        String dir = System.getProperty("user.dir");
        // File.separator keeps the path valid on non-Windows systems as well.
        File file = new File(String.join(File.separator, dir, "src", "main", "resources", "Test-RegexText.txt"));

        List<String> lines;
        try {
            lines = readLines(file);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            return;
        }

        List<String> emailList = findAll(VALID_EMAIL_ADDRESS_REGEX, lines);
        List<String> urlList = findAll(VALID_URL_REGEX, lines);
        List<String> dateList = findAll(VALID_DATE_REGEX, lines);
        List<String> timeList = toDisplayTimes(findAll(VALID_TIME_REGEX, lines));

        emailList.forEach(value -> System.out.println("Email : " + value));
        urlList.forEach(value -> System.out.println("Url : " + value));
        dateList.forEach(value -> System.out.println("Date : " + value));
        timeList.forEach(value -> System.out.println("Time : " + value));
    }

    /** Reads every line of {@code file} into memory, closing the scanner when done. */
    private static List<String> readLines(File file) throws FileNotFoundException {
        List<String> lines = new ArrayList<>();
        try (Scanner sc = new Scanner(file)) {
            while (sc.hasNextLine()) {
                lines.add(sc.nextLine());
            }
        }
        return lines;
    }

    /** Collects every occurrence of {@code pattern} in {@code lines}, in reading order. */
    private static List<String> findAll(Pattern pattern, List<String> lines) {
        List<String> found = new ArrayList<>();
        for (String line : lines) {
            Matcher matcher = pattern.matcher(line);
            while (matcher.find()) {
                found.add(matcher.group());
            }
        }
        return found;
    }

    /** Converts {@code HH.mm.ss} strings to the {@code K:mm a} display form, reporting any that fail to parse. */
    private static List<String> toDisplayTimes(List<String> rawTimes) {
        // Formatters are created per call: SimpleDateFormat is not safe to share across threads.
        SimpleDateFormat inputFormat = new SimpleDateFormat("HH.mm.ss");
        // NOTE(review): 'K' is the 0-11 hour-in-am/pm field, so 12.30.00 prints as "0:30 PM";
        // switch to 'h' if a 1-12 clock is wanted.
        SimpleDateFormat outputFormat = new SimpleDateFormat("K:mm a");

        List<String> formatted = new ArrayList<>();
        for (String raw : rawTimes) {
            try {
                formatted.add(outputFormat.format(inputFormat.parse(raw)));
            } catch (ParseException e) {
                // Should not happen for regex-validated input; report instead of hiding it.
                System.err.println("Skipping unparseable time '" + raw + "': " + e.getMessage());
            }
        }
        return formatted;
    }
}
 
// download full source code from here.
Download Now

No comments:

Post a Comment